From 2b40a08c7c1449fa83c66354bd6b8e9c7d149051 Mon Sep 17 00:00:00 2001 From: "David B. Tucker" Date: Thu, 19 Jul 2018 18:16:28 -0400 Subject: [PATCH 1/7] RFC: Add /healthz endpoint that returns 500 when some watch stream is stale. --- src/agent.cc | 2 +- src/api_server.cc | 34 +++++++++++++++++++++++++++++++++- src/api_server.h | 10 ++++++++-- src/health_checker.cc | 21 ++++++++++++++++++++- src/health_checker.h | 9 +++++++-- src/kubernetes.cc | 21 ++++++++++++++++++++- 6 files changed, 89 insertions(+), 8 deletions(-) diff --git a/src/agent.cc b/src/agent.cc index 77f9717b..14fdae8e 100644 --- a/src/agent.cc +++ b/src/agent.cc @@ -30,7 +30,7 @@ MetadataAgent::~MetadataAgent() {} void MetadataAgent::Start() { metadata_api_server_.reset(new MetadataApiServer( - config_, store_, config_.MetadataApiNumThreads(), "0.0.0.0", + config_, &health_checker_, store_, config_.MetadataApiNumThreads(), "0.0.0.0", config_.MetadataApiPort())); reporter_.reset(new MetadataReporter( config_, &store_, config_.MetadataReporterIntervalSeconds())); diff --git a/src/api_server.cc b/src/api_server.cc index 84036935..e260dae0 100644 --- a/src/api_server.cc +++ b/src/api_server.cc @@ -19,6 +19,7 @@ #include #include "configuration.h" +#include "health_checker.h" #include "http_common.h" #include "logging.h" #include "store.h" @@ -67,15 +68,22 @@ void MetadataApiServer::Dispatcher::log(const HttpServer::string_type& info) con MetadataApiServer::MetadataApiServer(const Configuration& config, + HealthChecker* health_checker, const MetadataStore& store, int server_threads, const std::string& host, int port) - : config_(config), store_(store), dispatcher_({ + : config_(config), health_checker_(health_checker), store_(store), + dispatcher_({ {{"GET", "/monitoredResource/"}, [=](const HttpServer::request& request, std::shared_ptr conn) { HandleMonitoredResource(request, conn); }}, + {{"GET", "/healthz"}, + [=](const HttpServer::request& request, + std::shared_ptr conn) { + HandleHealthz(request, conn); + }}, }, config_.VerboseLogging()), server_( HttpServer::options(dispatcher_) @@ -139,4 +147,28 @@ void MetadataApiServer::HandleMonitoredResource( } } +void MetadataApiServer::HandleHealthz( + const HttpServer::request& request, + std::shared_ptr conn) { + if (health_checker_->IsHealthy()) { + if (config_.VerboseLogging()) { + LOG(INFO) << "Healthz returning 200"; + } + conn->set_status(HttpServer::connection::ok); + conn->set_headers(std::map({ + {"Content-Type", "text/plain"}, + })); + conn->write("healthy"); + } else { + if (config_.VerboseLogging()) { + LOG(WARNING) << "Healthz returning 500"; + } + conn->set_status(HttpServer::connection::internal_server_error); + conn->set_headers(std::map({ + {"Content-Type", "text/plain"}, + })); + conn->write("unhealthy"); + } +} + } diff --git a/src/api_server.h b/src/api_server.h index b700a27b..b7195ef1 100644 --- a/src/api_server.h +++ b/src/api_server.h @@ -33,14 +33,17 @@ namespace google { // Configuration object. class Configuration; +class HealthChecker; + // Storage for the metadata mapping. class MetadataStore; // A server that implements the metadata agent API. class MetadataApiServer { public: - MetadataApiServer(const Configuration& config, const MetadataStore& store, - int server_threads, const std::string& host, int port); + MetadataApiServer(const Configuration& config, HealthChecker* health_checker, + const MetadataStore& store, int server_threads, + const std::string& host, int port); ~MetadataApiServer(); // Stops the server and closes the listening socket. @@ -71,8 +74,11 @@ class MetadataApiServer { // Handler functions. void HandleMonitoredResource(const HttpServer::request& request, std::shared_ptr conn); + void HandleHealthz(const HttpServer::request& request, + std::shared_ptr conn); const Configuration& config_; + HealthChecker* health_checker_; const MetadataStore& store_; Dispatcher dispatcher_; HttpServer server_; diff --git a/src/health_checker.cc b/src/health_checker.cc index 2b783964..24d18808 100644 --- a/src/health_checker.cc +++ b/src/health_checker.cc @@ -39,7 +39,15 @@ void HealthChecker::SetUnhealthy(const std::string& component) { bool HealthChecker::IsHealthy() const { std::lock_guard lock(mutex_); - return unhealthy_components_.empty(); + if (!unhealthy_components_.empty()) { + return false; + } + for (auto& c : component_callbacks_) { + if (!c.second()) { + return false; + } + } + return true; } void HealthChecker::CleanupForTest() { @@ -47,4 +55,15 @@ void HealthChecker::CleanupForTest() { config_.HealthCheckFile()).parent_path()); } +void HealthChecker::RegisterCallback(const std::string& component, + std::function callback) { + std::lock_guard lock(mutex_); + component_callbacks_[component] = callback; +} + +void HealthChecker::UnregisterCallback(const std::string& component) { + std::lock_guard lock(mutex_); + component_callbacks_.erase(component); +} + } // namespace google diff --git a/src/health_checker.h b/src/health_checker.h index 0afb107b..93446249 100644 --- a/src/health_checker.h +++ b/src/health_checker.h @@ -16,9 +16,10 @@ #ifndef HEALTH_CHECKER_H_ #define HEALTH_CHECKER_H_ -#include +#include #include #include +#include #include "configuration.h" @@ -29,15 +30,19 @@ class HealthChecker { public: HealthChecker(const Configuration& config); void SetUnhealthy(const std::string& component); + bool IsHealthy() const; + void RegisterCallback(const std::string& component, + std::function callback); + void UnregisterCallback(const std::string& component); private: friend class HealthCheckerUnittest; - bool IsHealthy() const; void CleanupForTest(); const Configuration& config_; std::set unhealthy_components_; + std::map> component_callbacks_; mutable std::mutex mutex_; }; diff --git a/src/kubernetes.cc b/src/kubernetes.cc index 5433dd30..bdb86416 100644 --- a/src/kubernetes.cc +++ b/src/kubernetes.cc @@ -796,6 +796,15 @@ void KubernetesReader::WatchMaster( if (verbose) { LOG(INFO) << "WatchMaster(" << name << "): Contacting " << endpoint; } + std::mutex last_received_mutex; + auto last_received = std::chrono::high_resolution_clock::now(); + if (health_checker_) { + health_checker_->RegisterCallback(name, [&last_received_mutex, &last_received]{ + std::lock_guard last_received_lock(last_received_mutex); + return last_received > (std::chrono::high_resolution_clock::now() - + std::chrono::minutes(5)); + }); + } try { if (verbose) { LOG(INFO) << "Locking completion mutex"; @@ -805,7 +814,11 @@ void KubernetesReader::WatchMaster( std::unique_lock watch_completion(completion_mutex); Watcher watcher( endpoint, - [=](json::value raw_watch) { + [=, &last_received_mutex, &last_received](json::value raw_watch) { + { + std::lock_guard last_received_lock(last_received_mutex); + last_received = std::chrono::high_resolution_clock::now(); + } WatchEventCallback(callback, name, std::move(raw_watch)); }, std::move(watch_completion), verbose); @@ -819,8 +832,14 @@ void KubernetesReader::WatchMaster( } } catch (const boost::system::system_error& e) { LOG(ERROR) << "Failed to query " << endpoint << ": " << e.what(); + if (health_checker_) { + health_checker_->UnregisterCallback(name); + } throw QueryException(endpoint + " -> " + e.what()); } + if (health_checker_) { + health_checker_->UnregisterCallback(name); + } } const std::string& KubernetesReader::KubernetesApiToken() const { From 90d4aa0667afa2365cd96e5b09fa13044a5257a0 Mon Sep 17 00:00:00 2001 From: "David B. Tucker" Date: Thu, 19 Jul 2018 19:52:31 -0400 Subject: [PATCH 2/7] Responded to comments. --- src/api_server.cc | 13 ++++++++++--- src/api_server.h | 5 +++-- src/configuration.cc | 8 +++++++- src/configuration.h | 5 +++++ src/health_checker.cc | 27 +++++++++++++++------------ src/health_checker.h | 31 +++++++++++++++++++++++++++---- src/kubernetes.cc | 20 +++++++------------- test/Makefile | 2 +- 8 files changed, 75 insertions(+), 36 deletions(-) diff --git a/src/api_server.cc b/src/api_server.cc index e260dae0..e048c775 100644 --- a/src/api_server.cc +++ b/src/api_server.cc @@ -68,7 +68,7 @@ void MetadataApiServer::Dispatcher::log(const HttpServer::string_type& info) con MetadataApiServer::MetadataApiServer(const Configuration& config, - HealthChecker* health_checker, + const HealthChecker* health_checker, const MetadataStore& store, int server_threads, const std::string& host, int port) @@ -150,7 +150,11 @@ void MetadataApiServer::HandleMonitoredResource( void MetadataApiServer::HandleHealthz( const HttpServer::request& request, std::shared_ptr conn) { - if (health_checker_->IsHealthy()) { + std::set unhealthy_components; + if (health_checker_ != nullptr) { + unhealthy_components = health_checker_->UnhealthyComponents(); + } + if (unhealthy_components.empty()) { if (config_.VerboseLogging()) { LOG(INFO) << "Healthz returning 200"; } @@ -167,7 +171,10 @@ void MetadataApiServer::HandleHealthz( conn->set_headers(std::map({ {"Content-Type", "text/plain"}, })); - conn->write("unhealthy"); + conn->write("unhealthy components:\n"); + for (const auto& s : unhealthy_components) { + conn->write(s + "\n"); + } } } diff --git a/src/api_server.h b/src/api_server.h index b7195ef1..687e8e8b 100644 --- a/src/api_server.h +++ b/src/api_server.h @@ -41,7 +41,8 @@ class MetadataStore; // A server that implements the metadata agent API. class MetadataApiServer { public: - MetadataApiServer(const Configuration& config, HealthChecker* health_checker, + MetadataApiServer(const Configuration& config, + const HealthChecker* health_checker, const MetadataStore& store, int server_threads, const std::string& host, int port); ~MetadataApiServer(); @@ -78,7 +79,7 @@ class MetadataApiServer { std::shared_ptr conn); const Configuration& config_; - HealthChecker* health_checker_; + const HealthChecker* health_checker_; const MetadataStore& store_; Dispatcher dispatcher_; HttpServer server_; diff --git a/src/configuration.cc b/src/configuration.cc index 41c56731..edfeb8b4 100644 --- a/src/configuration.cc +++ b/src/configuration.cc @@ -75,6 +75,7 @@ constexpr const char kDefaultInstanceId[] = ""; constexpr const char kDefaultInstanceZone[] = ""; constexpr const char kDefaultHealthCheckFile[] = "/var/run/metadata-agent/health/unhealthy"; +constexpr const int kDefaultHealthCheckWatchTimeoutSeconds = 5*60; } @@ -120,7 +121,9 @@ Configuration::Configuration() kubernetes_service_metadata_(kKubernetesDefaultServiceMetadata), instance_id_(kDefaultInstanceId), instance_zone_(kDefaultInstanceZone), - health_check_file_(kDefaultHealthCheckFile) {} + health_check_file_(kDefaultHealthCheckFile), + health_check_watch_timeout_seconds_( + kDefaultHealthCheckWatchTimeoutSeconds) {} Configuration::Configuration(std::istream& input) : Configuration() { ParseConfiguration(input); @@ -289,6 +292,9 @@ void Configuration::ParseConfiguration(std::istream& input) { config["InstanceZone"].as(instance_zone_); health_check_file_ = config["HealthCheckFile"].as(health_check_file_); + health_check_watch_timeout_seconds_ = + config["HealthCheckWatchTimeoutSeconds"].as( + health_check_watch_timeout_seconds_); } } // google diff --git a/src/configuration.h b/src/configuration.h index 2f90a3b6..9cea9ce0 100644 --- a/src/configuration.h +++ b/src/configuration.h @@ -162,6 +162,10 @@ class Configuration { std::lock_guard lock(mutex_); return health_check_file_; } + int HealthCheckWatchTimeoutSeconds() const { + std::lock_guard lock(mutex_); + return health_check_watch_timeout_seconds_; + } private: friend class ConfigurationArgumentParserTest; @@ -208,6 +212,7 @@ class Configuration { std::string instance_id_; std::string instance_zone_; std::string health_check_file_; + int health_check_watch_timeout_seconds_; }; } diff --git a/src/health_checker.cc b/src/health_checker.cc index 24d18808..d7766941 100644 --- a/src/health_checker.cc +++ b/src/health_checker.cc @@ -39,15 +39,7 @@ void HealthChecker::SetUnhealthy(const std::string& component) { bool HealthChecker::IsHealthy() const { std::lock_guard lock(mutex_); - if (!unhealthy_components_.empty()) { - return false; - } - for (auto& c : component_callbacks_) { - if (!c.second()) { - return false; - } - } - return true; + return !UnhealthyComponents().empty(); } void HealthChecker::CleanupForTest() { @@ -55,13 +47,24 @@ void HealthChecker::CleanupForTest() { config_.HealthCheckFile()).parent_path()); } -void HealthChecker::RegisterCallback(const std::string& component, - std::function callback) { +std::set HealthChecker::UnhealthyComponents() const { + std::lock_guard lock(mutex_); + std::set result(unhealthy_components_); + for (auto& c : component_callbacks_) { + if (c.second != nullptr && !c.second()) { + result.insert(c.first); + } + } + return std::move(result); +} + +void HealthChecker::RegisterComponent(const std::string& component, + std::function callback) { std::lock_guard lock(mutex_); component_callbacks_[component] = callback; } -void HealthChecker::UnregisterCallback(const std::string& component) { +void HealthChecker::UnregisterComponent(const std::string& component) { std::lock_guard lock(mutex_); component_callbacks_.erase(component); } diff --git a/src/health_checker.h b/src/health_checker.h index 93446249..360c9182 100644 --- a/src/health_checker.h +++ b/src/health_checker.h @@ -30,14 +30,15 @@ class HealthChecker { public: HealthChecker(const Configuration& config); void SetUnhealthy(const std::string& component); - bool IsHealthy() const; - void RegisterCallback(const std::string& component, - std::function callback); - void UnregisterCallback(const std::string& component); + std::set UnhealthyComponents() const; + void RegisterComponent(const std::string& component, + std::function callback); + void UnregisterComponent(const std::string& component); private: friend class HealthCheckerUnittest; + bool IsHealthy() const; void CleanupForTest(); const Configuration& config_; @@ -46,6 +47,28 @@ class HealthChecker { mutable std::mutex mutex_; }; +// Registers a component and then unregisters when it goes out of +// scope. +class ScopedHealthCheckRegistration { + public: + ScopedHealthCheckRegistration(HealthChecker* health_checker, + const std::string& component, + std::function callback) + : health_checker_(health_checker), component_(component) { + if (health_checker_ != nullptr) { + health_checker_->RegisterComponent(component_, callback); + } + } + ~ScopedHealthCheckRegistration() { + if (health_checker_ != nullptr) { + health_checker_->UnregisterComponent(component_); + } + } + private: + HealthChecker* health_checker_; + const std::string& component_; +}; + } // namespace google #endif // HEALTH_CHECKER_H_ diff --git a/src/kubernetes.cc b/src/kubernetes.cc index bdb86416..66283da9 100644 --- a/src/kubernetes.cc +++ b/src/kubernetes.cc @@ -798,13 +798,13 @@ void KubernetesReader::WatchMaster( } std::mutex last_received_mutex; auto last_received = std::chrono::high_resolution_clock::now(); - if (health_checker_) { - health_checker_->RegisterCallback(name, [&last_received_mutex, &last_received]{ - std::lock_guard last_received_lock(last_received_mutex); - return last_received > (std::chrono::high_resolution_clock::now() - - std::chrono::minutes(5)); - }); - } + auto timeout = std::chrono::seconds(config_.HealthCheckWatchTimeoutSeconds()); + ScopedHealthCheckRegistration health_check_registration( + health_checker_, name, [&last_received_mutex, &last_received, timeout]{ + std::lock_guard last_received_lock(last_received_mutex); + return last_received > (std::chrono::high_resolution_clock::now() + - timeout); + }); try { if (verbose) { LOG(INFO) << "Locking completion mutex"; @@ -832,14 +832,8 @@ void KubernetesReader::WatchMaster( } } catch (const boost::system::system_error& e) { LOG(ERROR) << "Failed to query " << endpoint << ": " << e.what(); - if (health_checker_) { - health_checker_->UnregisterCallback(name); - } throw QueryException(endpoint + " -> " + e.what()); } - if (health_checker_) { - health_checker_->UnregisterCallback(name); - } } const std::string& KubernetesReader::KubernetesApiToken() const { diff --git a/test/Makefile b/test/Makefile index 3ad7a3bc..fb8d0709 100644 --- a/test/Makefile +++ b/test/Makefile @@ -115,7 +115,7 @@ $(TESTS): $(GTEST_LIB) $(CPP_NETLIB_LIBS) $(YAML_CPP_LIBS) # Some headers need CPP_NETLIB_LIBS and YAML_CPP_LIBS. $(TESTS:%=%.o): $(GTEST_LIB) $(CPP_NETLIB_LIBS) $(YAML_CPP_LIBS) -api_server_unittest: api_server_unittest.o $(SRC_DIR)/api_server.o $(SRC_DIR)/configuration.o $(SRC_DIR)/store.o $(SRC_DIR)/json.o $(SRC_DIR)/resource.o $(SRC_DIR)/logging.o $(SRC_DIR)/time.o +api_server_unittest: api_server_unittest.o $(SRC_DIR)/api_server.o $(SRC_DIR)/configuration.o $(SRC_DIR)/store.o $(SRC_DIR)/json.o $(SRC_DIR)/resource.o $(SRC_DIR)/logging.o $(SRC_DIR)/time.o $(SRC_DIR)/health_checker.o $(CXX) $(LDFLAGS) $^ $(LDLIBS) -o $@ base64_unittest: base64_unittest.o $(SRC_DIR)/base64.o $(CXX) $(LDFLAGS) $^ $(LDLIBS) -o $@ From 6d719809ef4248a420a3bf834452ccbe6141de35 Mon Sep 17 00:00:00 2001 From: "David B. Tucker" Date: Fri, 20 Jul 2018 11:10:53 -0400 Subject: [PATCH 3/7] Address more comments. --- src/health_checker.cc | 6 +++--- src/health_checker.h | 11 +++++------ src/kubernetes.cc | 22 +++++++++++----------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/health_checker.cc b/src/health_checker.cc index d7766941..aacebadc 100644 --- a/src/health_checker.cc +++ b/src/health_checker.cc @@ -39,7 +39,7 @@ void HealthChecker::SetUnhealthy(const std::string& component) { bool HealthChecker::IsHealthy() const { std::lock_guard lock(mutex_); - return !UnhealthyComponents().empty(); + return unhealthy_components_.empty(); } void HealthChecker::CleanupForTest() { @@ -49,13 +49,13 @@ void HealthChecker::CleanupForTest() { std::set HealthChecker::UnhealthyComponents() const { std::lock_guard lock(mutex_); - std::set result(unhealthy_components_); + std::set result; for (auto& c : component_callbacks_) { if (c.second != nullptr && !c.second()) { result.insert(c.first); } } - return std::move(result); + return result; } void HealthChecker::RegisterComponent(const std::string& component, diff --git a/src/health_checker.h b/src/health_checker.h index 360c9182..ade8401d 100644 --- a/src/health_checker.h +++ b/src/health_checker.h @@ -49,24 +49,23 @@ class HealthChecker { // Registers a component and then unregisters when it goes out of // scope. -class ScopedHealthCheckRegistration { +class CheckHealth { public: - ScopedHealthCheckRegistration(HealthChecker* health_checker, - const std::string& component, - std::function callback) + CheckHealth(HealthChecker* health_checker, const std::string& component, + std::function callback) : health_checker_(health_checker), component_(component) { if (health_checker_ != nullptr) { health_checker_->RegisterComponent(component_, callback); } } - ~ScopedHealthCheckRegistration() { + ~CheckHealth() { if (health_checker_ != nullptr) { health_checker_->UnregisterComponent(component_); } } private: HealthChecker* health_checker_; - const std::string& component_; + const std::string component_; }; } // namespace google diff --git a/src/kubernetes.cc b/src/kubernetes.cc index 66283da9..b15e7725 100644 --- a/src/kubernetes.cc +++ b/src/kubernetes.cc @@ -796,14 +796,13 @@ void KubernetesReader::WatchMaster( if (verbose) { LOG(INFO) << "WatchMaster(" << name << "): Contacting " << endpoint; } - std::mutex last_received_mutex; - auto last_received = std::chrono::high_resolution_clock::now(); - auto timeout = std::chrono::seconds(config_.HealthCheckWatchTimeoutSeconds()); - ScopedHealthCheckRegistration health_check_registration( - health_checker_, name, [&last_received_mutex, &last_received, timeout]{ - std::lock_guard last_received_lock(last_received_mutex); - return last_received > (std::chrono::high_resolution_clock::now() - - timeout); + const int timeout = config_.HealthCheckWatchTimeoutSeconds(); + std::mutex expiration_mutex; + auto expiration = std::chrono::steady_clock::now() + time::seconds(timeout); + CheckHealth check_health( + health_checker_, name, [&expiration_mutex, &expiration]{ + std::lock_guard expiration_lock(expiration_mutex); + return std::chrono::high_resolution_clock::now() < expiration; }); try { if (verbose) { @@ -814,10 +813,11 @@ void KubernetesReader::WatchMaster( std::unique_lock watch_completion(completion_mutex); Watcher watcher( endpoint, - [=, &last_received_mutex, &last_received](json::value raw_watch) { + [=, &expiration_mutex, &expiration](json::value raw_watch) { { - std::lock_guard last_received_lock(last_received_mutex); - last_received = std::chrono::high_resolution_clock::now(); + std::lock_guard expiration_lock(expiration_mutex); + expiration = std::chrono::steady_clock::now() + + time::seconds(timeout); } WatchEventCallback(callback, name, std::move(raw_watch)); }, From 861dd08b6c130485701ce3410e76475dab478ced Mon Sep 17 00:00:00 2001 From: "David B. Tucker" Date: Fri, 20 Jul 2018 12:18:11 -0400 Subject: [PATCH 4/7] More comments. --- src/api_server.cc | 8 ++++---- src/configuration.cc | 12 ++++++------ src/configuration.h | 6 +++--- src/kubernetes.cc | 17 ++++++++++++----- 4 files changed, 25 insertions(+), 18 deletions(-) diff --git a/src/api_server.cc b/src/api_server.cc index e048c775..72b5c9d5 100644 --- a/src/api_server.cc +++ b/src/api_server.cc @@ -156,7 +156,7 @@ void MetadataApiServer::HandleHealthz( } if (unhealthy_components.empty()) { if (config_.VerboseLogging()) { - LOG(INFO) << "Healthz returning 200"; + LOG(INFO) << "/healthz returning 200"; } conn->set_status(HttpServer::connection::ok); conn->set_headers(std::map({ @@ -165,15 +165,15 @@ void MetadataApiServer::HandleHealthz( conn->write("healthy"); } else { if (config_.VerboseLogging()) { - LOG(WARNING) << "Healthz returning 500"; + LOG(WARNING) << "/healthz returning 500"; } conn->set_status(HttpServer::connection::internal_server_error); conn->set_headers(std::map({ {"Content-Type", "text/plain"}, })); conn->write("unhealthy components:\n"); - for (const auto& s : unhealthy_components) { - conn->write(s + "\n"); + for (const auto& component : unhealthy_components) { + conn->write(component + "\n"); } } } diff --git a/src/configuration.cc b/src/configuration.cc index edfeb8b4..e6f37a54 100644 --- a/src/configuration.cc +++ b/src/configuration.cc @@ -75,7 +75,7 @@ constexpr const char kDefaultInstanceId[] = ""; constexpr const char kDefaultInstanceZone[] = ""; constexpr const char kDefaultHealthCheckFile[] = "/var/run/metadata-agent/health/unhealthy"; -constexpr const int kDefaultHealthCheckWatchTimeoutSeconds = 5*60; +constexpr const int kDefaultHealthCheckMaxDataAgeSeconds = 5*60; } @@ -122,8 +122,8 @@ Configuration::Configuration() instance_id_(kDefaultInstanceId), instance_zone_(kDefaultInstanceZone), health_check_file_(kDefaultHealthCheckFile), - health_check_watch_timeout_seconds_( - kDefaultHealthCheckWatchTimeoutSeconds) {} + health_check_max_data_age_seconds_( + kDefaultHealthCheckMaxDataAgeSeconds) {} Configuration::Configuration(std::istream& input) : Configuration() { ParseConfiguration(input); @@ -292,9 +292,9 @@ void Configuration::ParseConfiguration(std::istream& input) { config["InstanceZone"].as(instance_zone_); health_check_file_ = config["HealthCheckFile"].as(health_check_file_); - health_check_watch_timeout_seconds_ = - config["HealthCheckWatchTimeoutSeconds"].as( - health_check_watch_timeout_seconds_); + health_check_max_data_age_seconds_ = + config["HealthCheckMaxDataAgeSeconds"].as( + health_check_max_data_age_seconds_); } } // google diff --git a/src/configuration.h b/src/configuration.h index 9cea9ce0..099d767b 100644 --- a/src/configuration.h +++ b/src/configuration.h @@ -162,9 +162,9 @@ class Configuration { std::lock_guard lock(mutex_); return health_check_file_; } - int HealthCheckWatchTimeoutSeconds() const { + int HealthCheckMaxDataAgeSeconds() const { std::lock_guard lock(mutex_); - return health_check_watch_timeout_seconds_; + return health_check_max_data_age_seconds_; } private: @@ -212,7 +212,7 @@ class Configuration { std::string instance_id_; std::string instance_zone_; std::string health_check_file_; - int health_check_watch_timeout_seconds_; + int health_check_max_data_age_seconds_; }; } diff --git a/src/kubernetes.cc b/src/kubernetes.cc index b15e7725..f20f40ca 100644 --- a/src/kubernetes.cc +++ b/src/kubernetes.cc @@ -796,13 +796,20 @@ void KubernetesReader::WatchMaster( if (verbose) { LOG(INFO) << "WatchMaster(" << name << "): Contacting " << endpoint; } - const int timeout = config_.HealthCheckWatchTimeoutSeconds(); + // Initialize the expiration time. This is the time by when the + // watcher has to receive some data to be considered healthy. Each + // receipt of a new message bumps the expiration forward. + // + // TODO: Cache the expiration (or timestamp of last receipt) in the + // store instead of here. + const int max_data_age = config_.HealthCheckMaxDataAgeSeconds(); std::mutex expiration_mutex; - auto expiration = std::chrono::steady_clock::now() + time::seconds(timeout); + auto expiration = + std::chrono::steady_clock::now() + time::seconds(max_data_age); CheckHealth check_health( health_checker_, name, [&expiration_mutex, &expiration]{ std::lock_guard expiration_lock(expiration_mutex); - return std::chrono::high_resolution_clock::now() < expiration; + return std::chrono::steady_clock::now() < expiration; }); try { if (verbose) { @@ -816,8 +823,8 @@ void KubernetesReader::WatchMaster( [=, &expiration_mutex, &expiration](json::value raw_watch) { { std::lock_guard expiration_lock(expiration_mutex); - expiration = std::chrono::steady_clock::now() + - time::seconds(timeout); + expiration = + std::chrono::steady_clock::now() + time::seconds(max_data_age); } WatchEventCallback(callback, name, std::move(raw_watch)); }, From ad5734d2784703381491d1003147f9ee4f5e3499 Mon Sep 17 00:00:00 2001 From: "David B. Tucker" Date: Sun, 31 Dec 2017 19:04:22 -0500 Subject: [PATCH 5/7] Always log warning for /healthz failure. --- src/api_server.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/api_server.cc b/src/api_server.cc index 72b5c9d5..57c937de 100644 --- a/src/api_server.cc +++ b/src/api_server.cc @@ -164,9 +164,7 @@ void MetadataApiServer::HandleHealthz( })); conn->write("healthy"); } else { - if (config_.VerboseLogging()) { - LOG(WARNING) << "/healthz returning 500"; - } + LOG(WARNING) << "/healthz returning 500"; conn->set_status(HttpServer::connection::internal_server_error); conn->set_headers(std::map({ {"Content-Type", "text/plain"}, From a7eec9b9fd151e9c2e065c311420404e3930830b Mon Sep 17 00:00:00 2001 From: "David B. Tucker" Date: Fri, 20 Jul 2018 16:19:52 -0400 Subject: [PATCH 6/7] Include unhealthy components in set. --- src/health_checker.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/health_checker.cc b/src/health_checker.cc index aacebadc..9f5adb90 100644 --- a/src/health_checker.cc +++ b/src/health_checker.cc @@ -49,7 +49,7 @@ void HealthChecker::CleanupForTest() { std::set HealthChecker::UnhealthyComponents() const { std::lock_guard lock(mutex_); - std::set result; + std::set result(unhealthy_components_); for (auto& c : component_callbacks_) { if (c.second != nullptr && !c.second()) { result.insert(c.first); From 384a8062d83637e0e1b185f99bb40d82bc3b91ec Mon Sep 17 00:00:00 2001 From: "David B. Tucker" Date: Fri, 20 Jul 2018 18:06:29 -0400 Subject: [PATCH 7/7] Log unhealthy components. --- src/api_server.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/api_server.cc b/src/api_server.cc index 57c937de..275eddaf 100644 --- a/src/api_server.cc +++ b/src/api_server.cc @@ -164,7 +164,8 @@ void MetadataApiServer::HandleHealthz( })); conn->write("healthy"); } else { - LOG(WARNING) << "/healthz returning 500"; + LOG(WARNING) << "/healthz returning 500; unhealthy components: " + << boost::algorithm::join(unhealthy_components, ", "); conn->set_status(HttpServer::connection::internal_server_error); conn->set_headers(std::map({ {"Content-Type", "text/plain"},