From 42a146e06067d386b6290d78711c28cadf0fb2a8 Mon Sep 17 00:00:00 2001 From: koarz Date: Mon, 7 Jul 2025 16:41:48 +0800 Subject: [PATCH 1/7] [enhance](meta-service)add bvar for fdb process status --- cloud/src/common/bvars.cpp | 2 + cloud/src/common/bvars.h | 2 + cloud/src/common/metric.cpp | 81 +++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) diff --git a/cloud/src/common/bvars.cpp b/cloud/src/common/bvars.cpp index 3ca961afffb725..1e9e7c4ede4a01 100644 --- a/cloud/src/common/bvars.cpp +++ b/cloud/src/common/bvars.cpp @@ -211,6 +211,8 @@ bvar::Status g_bvar_fdb_workload_transactions_started_hz("fdb_workload_ bvar::Status g_bvar_fdb_workload_transactions_committed_hz("fdb_workload_transactions_committed_hz", BVAR_FDB_INVALID_VALUE); bvar::Status g_bvar_fdb_workload_transactions_rejected_hz("fdb_workload_transactions_rejected_hz", BVAR_FDB_INVALID_VALUE); bvar::Status g_bvar_fdb_client_thread_busyness_percent("fdb_client_thread_busyness_percent", BVAR_FDB_INVALID_VALUE); +mBvarStatus g_bvar_fdb_process_status_int("fdb_process_status_int", {"process_id", "component", "metric"}); +mBvarStatus g_bvar_fdb_process_status_float("fdb_process_status_float", {"process_id", "component", "metric"}); // checker's bvars BvarStatusWithTag g_bvar_checker_num_scanned("checker", "num_scanned"); diff --git a/cloud/src/common/bvars.h b/cloud/src/common/bvars.h index 6034afe7112e32..d9dfb544d1ae33 100644 --- a/cloud/src/common/bvars.h +++ b/cloud/src/common/bvars.h @@ -348,6 +348,8 @@ extern bvar::Status g_bvar_fdb_workload_transactions_started_hz; extern bvar::Status g_bvar_fdb_workload_transactions_committed_hz; extern bvar::Status g_bvar_fdb_workload_transactions_rejected_hz; extern bvar::Status g_bvar_fdb_client_thread_busyness_percent; +extern mBvarStatus g_bvar_fdb_process_status_int; +extern mBvarStatus g_bvar_fdb_process_status_float; // checker extern BvarStatusWithTag g_bvar_checker_num_scanned; diff --git a/cloud/src/common/metric.cpp b/cloud/src/common/metric.cpp index a9b91c6c853ccd..505f695ee1a535 100644 --- a/cloud/src/common/metric.cpp +++ b/cloud/src/common/metric.cpp @@ -17,10 +17,12 @@ #include "metric.h" +#include #include #include #include +#include #include #include #include @@ -28,6 +30,7 @@ #include #include "common/bvars.h" +#include "common/logging.h" #include "meta-store/txn_kv.h" #include "meta-store/txn_kv_error.h" @@ -134,6 +137,79 @@ static void export_fdb_status_details(const std::string& status_str) { DCHECK(node->value.IsDouble()); return static_cast(node->value.GetDouble() * NANOSECONDS); }; + auto get_process_metric = [&](std::string component) { + class RecursiveNameHelper { + public: + explicit RecursiveNameHelper(std::string name) : name_(std::move(name)) {} + + RecursiveNameHelper next_level_name(std::string name) const { + return RecursiveNameHelper(name_ + '_' + name); + } + + std::string& get_name() { return name_; } + + private: + std::string name_; + }; + auto node = document.FindMember("cluster"); + if (!node->value.HasMember("processes")) return; + node = node->value.FindMember("processes"); + // process + for (auto process_node = node->value.MemberBegin(); process_node != node->value.MemberEnd(); + process_node++) { + const char* process_id = process_node->name.GetString(); + decltype(process_node) component_node; + // get component iter + if (!process_node->value.HasMember(component.data())) { + component_node = process_node->value.FindMember(component.data()); + } + // There are three cases here: int64, double, and object. + // If it is double or int64, put it directly into the bvar. + // If it is an object, recursively obtain the full name and corresponding value. + // such as: {"disk": {"reads": {"counter": 123, "hz": 0}}} + // component is "disk", the names of these two values should be "reads_counter" and "reads_hz" + // proved two type lambda func to handle object and other type + + // set_bvar_value is responsible for setting integer and float values to the corresponding bvar. + auto set_bvar_value = [&process_id, &component]( + RecursiveNameHelper& name_helper, + decltype(process_node)& temp_node) -> void { + if (temp_node->value.IsInt64()) { + g_bvar_fdb_process_status_int.put( + {process_id, component, name_helper.get_name()}, + temp_node->value.GetInt64()); + return; + } + if (temp_node->value.IsDouble()) { + g_bvar_fdb_process_status_float.put( + {process_id, component, name_helper.get_name()}, + temp_node->value.GetDouble()); + return; + } + LOG(WARNING) << fmt::format( + "Get process metrics set_bvar_value input a wrong type node {}", + name_helper.get_name()); + }; + auto object_recursive = [&set_bvar_value](auto&& self, RecursiveNameHelper name_helper, + decltype(process_node) temp_node) -> void { + if (temp_node->value.IsObject()) { + for (auto iter = temp_node->value.MemberBegin(); + iter != temp_node->value.MemberEnd(); iter++) { + self(self, name_helper.next_level_name(iter->name.GetString()), iter); + } + } + // Note that the parameter passed to set_bvar_value here is the current node, not its Member. + // so we can directly call object_recursive in the loop below(metric_node). + // if the node is a object, then get Member(iter) and recursive with iter as arg + set_bvar_value(name_helper, temp_node); + }; + for (auto metric_node = component_node->value.MemberBegin(); + metric_node != component_node->value.MemberEnd(); metric_node++) { + object_recursive(object_recursive, + RecursiveNameHelper(metric_node->name.GetString()), metric_node); + } + } + }; // Configuration g_bvar_fdb_configuration_coordinators_count.set_value( get_value({"configuration", "coordinators_count"})); @@ -226,6 +302,11 @@ static void export_fdb_status_details(const std::string& status_str) { } } } + + // Process Status + get_process_metric("cpu"); + get_process_metric("disk"); + get_process_metric("memory"); } void FdbMetricExporter::export_fdb_metrics(TxnKv* txn_kv) { From df7b7c584370d718874573d105c14373ffc259f1 Mon Sep 17 00:00:00 2001 From: koarz Date: Mon, 7 Jul 2025 17:01:18 +0800 Subject: [PATCH 2/7] fix --- cloud/src/common/metric.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cloud/src/common/metric.cpp b/cloud/src/common/metric.cpp index 505f695ee1a535..a392e2d33471a8 100644 --- a/cloud/src/common/metric.cpp +++ b/cloud/src/common/metric.cpp @@ -160,9 +160,8 @@ static void export_fdb_status_details(const std::string& status_str) { const char* process_id = process_node->name.GetString(); decltype(process_node) component_node; // get component iter - if (!process_node->value.HasMember(component.data())) { - component_node = process_node->value.FindMember(component.data()); - } + if (!process_node->value.HasMember(component.data())) return; + component_node = process_node->value.FindMember(component.data()); // There are three cases here: int64, double, and object. // If it is double or int64, put it directly into the bvar. // If it is an object, recursively obtain the full name and corresponding value. From 0ecdc144f470dee8c9e67a55f6feec00dcb040a3 Mon Sep 17 00:00:00 2001 From: koarz Date: Wed, 9 Jul 2025 10:31:27 +0800 Subject: [PATCH 3/7] fix --- cloud/src/common/metric.cpp | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/cloud/src/common/metric.cpp b/cloud/src/common/metric.cpp index a392e2d33471a8..b01e0ed08e7c15 100644 --- a/cloud/src/common/metric.cpp +++ b/cloud/src/common/metric.cpp @@ -167,45 +167,46 @@ static void export_fdb_status_details(const std::string& status_str) { // If it is an object, recursively obtain the full name and corresponding value. // such as: {"disk": {"reads": {"counter": 123, "hz": 0}}} // component is "disk", the names of these two values should be "reads_counter" and "reads_hz" + auto recursive_name_helper = [](std::string& origin_name, + const char* next_level_name) -> std::string { + return origin_name + '_' + next_level_name; + }; // proved two type lambda func to handle object and other type // set_bvar_value is responsible for setting integer and float values to the corresponding bvar. auto set_bvar_value = [&process_id, &component]( - RecursiveNameHelper& name_helper, + std::string& name, decltype(process_node)& temp_node) -> void { if (temp_node->value.IsInt64()) { - g_bvar_fdb_process_status_int.put( - {process_id, component, name_helper.get_name()}, - temp_node->value.GetInt64()); + g_bvar_fdb_process_status_int.put({process_id, component, name}, + temp_node->value.GetInt64()); return; } if (temp_node->value.IsDouble()) { - g_bvar_fdb_process_status_float.put( - {process_id, component, name_helper.get_name()}, - temp_node->value.GetDouble()); + g_bvar_fdb_process_status_float.put({process_id, component, name}, + temp_node->value.GetDouble()); return; } LOG(WARNING) << fmt::format( - "Get process metrics set_bvar_value input a wrong type node {}", - name_helper.get_name()); + "Get process metrics set_bvar_value input a wrong type node {}", name); }; - auto object_recursive = [&set_bvar_value](auto&& self, RecursiveNameHelper name_helper, - decltype(process_node) temp_node) -> void { + auto object_recursive = [&set_bvar_value, &recursive_name_helper]( + auto&& self, std::string name, + decltype(process_node) temp_node) -> void { if (temp_node->value.IsObject()) { for (auto iter = temp_node->value.MemberBegin(); iter != temp_node->value.MemberEnd(); iter++) { - self(self, name_helper.next_level_name(iter->name.GetString()), iter); + self(self, recursive_name_helper(name, iter->name.GetString()), iter); } } // Note that the parameter passed to set_bvar_value here is the current node, not its Member. // so we can directly call object_recursive in the loop below(metric_node). // if the node is a object, then get Member(iter) and recursive with iter as arg - set_bvar_value(name_helper, temp_node); + set_bvar_value(name, temp_node); }; for (auto metric_node = component_node->value.MemberBegin(); metric_node != component_node->value.MemberEnd(); metric_node++) { - object_recursive(object_recursive, - RecursiveNameHelper(metric_node->name.GetString()), metric_node); + object_recursive(object_recursive, metric_node->name.GetString(), metric_node); } } }; From 47c856ef4c926ba344ba9988733f4acbc9268c7e Mon Sep 17 00:00:00 2001 From: koarz Date: Wed, 9 Jul 2025 10:32:15 +0800 Subject: [PATCH 4/7] fix --- cloud/src/common/metric.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/cloud/src/common/metric.cpp b/cloud/src/common/metric.cpp index b01e0ed08e7c15..2ba84abf30b969 100644 --- a/cloud/src/common/metric.cpp +++ b/cloud/src/common/metric.cpp @@ -138,19 +138,6 @@ static void export_fdb_status_details(const std::string& status_str) { return static_cast(node->value.GetDouble() * NANOSECONDS); }; auto get_process_metric = [&](std::string component) { - class RecursiveNameHelper { - public: - explicit RecursiveNameHelper(std::string name) : name_(std::move(name)) {} - - RecursiveNameHelper next_level_name(std::string name) const { - return RecursiveNameHelper(name_ + '_' + name); - } - - std::string& get_name() { return name_; } - - private: - std::string name_; - }; auto node = document.FindMember("cluster"); if (!node->value.HasMember("processes")) return; node = node->value.FindMember("processes"); From 224008220848684d5a97bb8f3625a75756a84ca1 Mon Sep 17 00:00:00 2001 From: koarz Date: Wed, 9 Jul 2025 10:35:07 +0800 Subject: [PATCH 5/7] comment --- cloud/src/common/metric.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cloud/src/common/metric.cpp b/cloud/src/common/metric.cpp index 2ba84abf30b969..0bd3bc41e73b4e 100644 --- a/cloud/src/common/metric.cpp +++ b/cloud/src/common/metric.cpp @@ -180,17 +180,18 @@ static void export_fdb_status_details(const std::string& status_str) { auto object_recursive = [&set_bvar_value, &recursive_name_helper]( auto&& self, std::string name, decltype(process_node) temp_node) -> void { + // if the node is an object, then get Member(iter) and recursive with iter as arg if (temp_node->value.IsObject()) { for (auto iter = temp_node->value.MemberBegin(); iter != temp_node->value.MemberEnd(); iter++) { self(self, recursive_name_helper(name, iter->name.GetString()), iter); } } - // Note that the parameter passed to set_bvar_value here is the current node, not its Member. - // so we can directly call object_recursive in the loop below(metric_node). - // if the node is a object, then get Member(iter) and recursive with iter as arg + // if not object, set bvar value set_bvar_value(name, temp_node); }; + // Note that the parameter passed to set_bvar_value here is the current node, not its Member + // so we can directly call object_recursive in the loop for (auto metric_node = component_node->value.MemberBegin(); metric_node != component_node->value.MemberEnd(); metric_node++) { object_recursive(object_recursive, metric_node->name.GetString(), metric_node); From 5eb5d8e03d9331fa279feada2461de970fcc5e93 Mon Sep 17 00:00:00 2001 From: koarz Date: Thu, 10 Jul 2025 10:24:00 +0800 Subject: [PATCH 6/7] test --- cloud/test/metric_test.cpp | 125 +++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/cloud/test/metric_test.cpp b/cloud/test/metric_test.cpp index 31a2b7b3c5821f..e1aec84d0d22e8 100644 --- a/cloud/test/metric_test.cpp +++ b/cloud/test/metric_test.cpp @@ -172,4 +172,129 @@ TEST(MetricTest, FdbMetricExporterTest) { ASSERT_EQ(g_bvar_fdb_machines_count.get_value(), BVAR_FDB_INVALID_VALUE); ASSERT_EQ(g_bvar_fdb_client_count.get_value(), BVAR_FDB_INVALID_VALUE); } + + // process status + { + g_bvar_fdb_machines_count.set_value(BVAR_FDB_INVALID_VALUE); + g_bvar_fdb_client_count.set_value(BVAR_FDB_INVALID_VALUE); + + std::string fdb_metric_example = "./fdb_metric_example.json"; + std::ifstream inFile(fdb_metric_example); + + ASSERT_TRUE(inFile.is_open()); + std::string fileContent((std::istreambuf_iterator(inFile)), + std::istreambuf_iterator()); + + std::string word_to_replace = "cluster"; + std::string new_word = "xxxx"; + + size_t start_pos = 0; + while ((start_pos = fileContent.find(word_to_replace, start_pos)) != std::string::npos) { + fileContent.replace(start_pos, word_to_replace.length(), new_word); + start_pos += new_word.length(); + } + std::shared_ptr txn_kv = std::make_shared(); + std::unique_ptr txn; + ASSERT_EQ(txn_kv->create_txn(&txn), TxnErrorCode::TXN_OK); + txn->put("\xff\xff/status/json", fileContent); + ASSERT_EQ(txn->commit(), TxnErrorCode::TXN_OK); + + FdbMetricExporter fdb_metric_exporter(txn_kv); + fdb_metric_exporter.sleep_interval_ms_ = 1; + fdb_metric_exporter.start(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + fdb_metric_exporter.stop(); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"09ca90b9f3f413e5816b2610ed8b465d", "cpu", "usage_cores"}), + 0.0012292); + ASSERT_EQ(g_bvar_fdb_process_status_float.get( + {"09ca90b9f3f413e5816b2610ed8b465d", "disk", "busy"}), + 0.0085999800000000001); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"09ca90b9f3f413e5816b2610ed8b465d", "disk", "free_bytes"}), + 490412584960); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"09ca90b9f3f413e5816b2610ed8b465d", "disk", "reads_counter"}), + 854857); + ASSERT_EQ(g_bvar_fdb_process_status_float.get( + {"09ca90b9f3f413e5816b2610ed8b465d", "disk", "reads_hz"}), + 0); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"09ca90b9f3f413e5816b2610ed8b465d", "disk", "reads_sectors"}), + 0); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"09ca90b9f3f413e5816b2610ed8b465d", "disk", "total_bytes"}), + 527295578112); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"09ca90b9f3f413e5816b2610ed8b465d", "disk", "writes_counter"}), + 73765457); + ASSERT_EQ(g_bvar_fdb_process_status_float.get( + {"09ca90b9f3f413e5816b2610ed8b465d", "disk", "writes_hz"}), + 26.1999); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"09ca90b9f3f413e5816b2610ed8b465d", "disk", "writes_sectors"}), + 1336); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"09ca90b9f3f413e5816b2610ed8b465d", "memory", "available_bytes"}), + 3065090867); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"09ca90b9f3f413e5816b2610ed8b465d", "memory", "limit_bytes"}), + 8589934592); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"09ca90b9f3f413e5816b2610ed8b465d", "memory", "rss_bytes"}), + 46551040); + ASSERT_EQ(g_bvar_fdb_process_status_int.get({"09ca90b9f3f413e5816b2610ed8b465d", "memory", + "unused_allocated_memory"}), + 655360); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"09ca90b9f3f413e5816b2610ed8b465d", "memory", "used_bytes"}), + 122974208); + + // test second process + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"0a456165f04e1ec1a2ade0ce523d54a8", "cpu", "usage_cores"}), + 0.0049765900000000004); + ASSERT_EQ(g_bvar_fdb_process_status_float.get( + {"0a456165f04e1ec1a2ade0ce523d54a8", "disk", "busy"}), + 0.012200000000000001); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"0a456165f04e1ec1a2ade0ce523d54a8", "disk", "free_bytes"}), + 489160159232); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"0a456165f04e1ec1a2ade0ce523d54a8", "disk", "reads_counter"}), + 877107); + ASSERT_EQ(g_bvar_fdb_process_status_float.get( + {"0a456165f04e1ec1a2ade0ce523d54a8", "disk", "reads_hz"}), + 0); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"0a456165f04e1ec1a2ade0ce523d54a8", "disk", "reads_sectors"}), + 0); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"0a456165f04e1ec1a2ade0ce523d54a8", "disk", "total_bytes"}), + 527295578112); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"0a456165f04e1ec1a2ade0ce523d54a8", "disk", "writes_counter"}), + 79316112); + ASSERT_EQ(g_bvar_fdb_process_status_float.get( + {"0a456165f04e1ec1a2ade0ce523d54a8", "disk", "writes_hz"}), + 30.9999); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"0a456165f04e1ec1a2ade0ce523d54a8", "disk", "writes_sectors"}), + 744); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"0a456165f04e1ec1a2ade0ce523d54a8", "memory", "available_bytes"}), + 3076787404); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"0a456165f04e1ec1a2ade0ce523d54a8", "memory", "limit_bytes"}), + 8589934592); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"0a456165f04e1ec1a2ade0ce523d54a8", "memory", "rss_bytes"}), + 72359936); + ASSERT_EQ(g_bvar_fdb_process_status_int.get({"0a456165f04e1ec1a2ade0ce523d54a8", "memory", + "unused_allocated_memory"}), + 393216); + ASSERT_EQ(g_bvar_fdb_process_status_int.get( + {"0a456165f04e1ec1a2ade0ce523d54a8", "memory", "used_bytes"}), + 157978624); + } } \ No newline at end of file From d1c2bad5a20ff211ee0618afffd0b9c74f5c73eb Mon Sep 17 00:00:00 2001 From: koarz Date: Thu, 10 Jul 2025 10:55:29 +0800 Subject: [PATCH 7/7] fix --- cloud/src/common/metric.cpp | 1 + cloud/test/metric_test.cpp | 12 ++---------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/cloud/src/common/metric.cpp b/cloud/src/common/metric.cpp index 0bd3bc41e73b4e..124a5f26a063af 100644 --- a/cloud/src/common/metric.cpp +++ b/cloud/src/common/metric.cpp @@ -186,6 +186,7 @@ static void export_fdb_status_details(const std::string& status_str) { iter != temp_node->value.MemberEnd(); iter++) { self(self, recursive_name_helper(name, iter->name.GetString()), iter); } + return; } // if not object, set bvar value set_bvar_value(name, temp_node); diff --git a/cloud/test/metric_test.cpp b/cloud/test/metric_test.cpp index e1aec84d0d22e8..81174c73924de9 100644 --- a/cloud/test/metric_test.cpp +++ b/cloud/test/metric_test.cpp @@ -185,14 +185,6 @@ TEST(MetricTest, FdbMetricExporterTest) { std::string fileContent((std::istreambuf_iterator(inFile)), std::istreambuf_iterator()); - std::string word_to_replace = "cluster"; - std::string new_word = "xxxx"; - - size_t start_pos = 0; - while ((start_pos = fileContent.find(word_to_replace, start_pos)) != std::string::npos) { - fileContent.replace(start_pos, word_to_replace.length(), new_word); - start_pos += new_word.length(); - } std::shared_ptr txn_kv = std::make_shared(); std::unique_ptr txn; ASSERT_EQ(txn_kv->create_txn(&txn), TxnErrorCode::TXN_OK); @@ -204,7 +196,7 @@ TEST(MetricTest, FdbMetricExporterTest) { fdb_metric_exporter.start(); std::this_thread::sleep_for(std::chrono::milliseconds(10)); fdb_metric_exporter.stop(); - ASSERT_EQ(g_bvar_fdb_process_status_int.get( + ASSERT_EQ(g_bvar_fdb_process_status_float.get( {"09ca90b9f3f413e5816b2610ed8b465d", "cpu", "usage_cores"}), 0.0012292); ASSERT_EQ(g_bvar_fdb_process_status_float.get( @@ -251,7 +243,7 @@ TEST(MetricTest, FdbMetricExporterTest) { 122974208); // test second process - ASSERT_EQ(g_bvar_fdb_process_status_int.get( + ASSERT_EQ(g_bvar_fdb_process_status_float.get( {"0a456165f04e1ec1a2ade0ce523d54a8", "cpu", "usage_cores"}), 0.0049765900000000004); ASSERT_EQ(g_bvar_fdb_process_status_float.get(