From cba2cede278c07a3f65775f5ef8be1aaad67c7d3 Mon Sep 17 00:00:00 2001 From: Shriram Rajagopalan Date: Tue, 7 Mar 2017 10:41:10 -0500 Subject: [PATCH 1/7] outlier detection config --- source/common/json/config_schemas.cc | 32 ++++++++++++- .../common/upstream/outlier_detection_impl.cc | 47 ++++++++++++------- .../common/upstream/outlier_detection_impl.h | 33 +++++++++++-- .../upstream/outlier_detection_impl_test.cc | 21 +++++++++ 4 files changed, 109 insertions(+), 24 deletions(-) diff --git a/source/common/json/config_schemas.cc b/source/common/json/config_schemas.cc index 1c0bce68d0448..3e7ac2af15f34 100644 --- a/source/common/json/config_schemas.cc +++ b/source/common/json/config_schemas.cc @@ -1061,7 +1061,37 @@ const std::string Json::Schema::CLUSTER_SCHEMA(R"EOF( "minimum" : 0, "exclusiveMinimum" : true }, - "outlier_detection" : {"type" : "object"} + "outlier_detection" : { + "type" : "object", + "properties" : { + "consecutive_5xx" : { + "type" : "integer", + "minimum" : 0, + "exclusiveMinimum" : true + }, + "interval_ms" : { + "type" : "integer", + "minimum" : 0, + "exclusiveMinimum" : true + }, + "base_ejection_time_ms" : { + "type" : "integer", + "minimum" : 0, + "exclusiveMinimum" : true + }, + "max_ejection_percent" : { + "type" : "integer", + "minimum" : 0, + "maximum" : 100 + }, + "enforcing" : { + "type" : "integer", + "minimum" : 0, + "maximum" : 100 + } + }, + "additionalProperties" : false + } }, "required" : ["name", "type", "connect_timeout_ms", "lb_type"], "additionalProperties" : false diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 178803e70bfc6..2096a93632887 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -17,8 +17,8 @@ DetectorPtr DetectorImplFactory::createForCluster(Cluster& cluster, // Right now we don't support any configuration but in order to make the config backwards // compatible we just look for an empty object. if (cluster_config.hasObject("outlier_detection")) { - return DetectorImpl::create(cluster, dispatcher, runtime, ProdSystemTimeSource::instance_, - event_logger); + return DetectorImpl::create(cluster, cluster_config.getObject("outlier_detection", true), + dispatcher, runtime, ProdSystemTimeSource::instance_, event_logger); } else { return nullptr; } @@ -44,7 +44,8 @@ void DetectorHostSinkImpl::putHttpResponseCode(uint64_t response_code) { } if (++consecutive_5xx_ == - detector->runtime().snapshot().getInteger("outlier_detection.consecutive_5xx", 5)) { + detector->runtime().snapshot().getInteger("outlier_detection.consecutive_5xx", + detector->config().consecutive5xx())) { detector->onConsecutive5xx(host_.lock()); } } else { @@ -52,10 +53,19 @@ void DetectorHostSinkImpl::putHttpResponseCode(uint64_t response_code) { } } -DetectorImpl::DetectorImpl(const Cluster& cluster, Event::Dispatcher& dispatcher, - Runtime::Loader& runtime, SystemTimeSource& time_source, - EventLoggerPtr event_logger) - : dispatcher_(dispatcher), runtime_(runtime), time_source_(time_source), +DetectorConfig::DetectorConfig(const Json::ObjectPtr json_config) { + interval_ms_ = static_cast json_config->getInteger("interval_ms", 10000); + base_ejection_time_ms_ = + static_cast json_config->getInteger("base_ejection_time_ms", 30000); + consecutive_5xx_ = static_cast json_config->getInteger("consecutive_5xx", 5); + max_ejection_percent_ = static_cast json_config->getInteger("max_ejection_percent", 10); + enforcing_ = static_cast json_config->getInteger("enforcing", 100); +} + +DetectorImpl::DetectorImpl(const Cluster& cluster, const Json::ObjectPtr json_config, + Event::Dispatcher& dispatcher, Runtime::Loader& runtime, + SystemTimeSource& time_source, EventLoggerPtr event_logger) + : config_(json_config), dispatcher_(dispatcher), runtime_(runtime), time_source_(time_source), stats_(generateStats(cluster.info()->statsScope())), interval_timer_(dispatcher.createTimer([this]() -> void { onIntervalTimer(); })), event_logger_(event_logger) {} @@ -69,13 +79,12 @@ DetectorImpl::~DetectorImpl() { } } -std::shared_ptr DetectorImpl::create(const Cluster& cluster, - Event::Dispatcher& dispatcher, - Runtime::Loader& runtime, - SystemTimeSource& time_source, - EventLoggerPtr event_logger) { +std::shared_ptr +DetectorImpl::create(const Cluster& cluster, const Json::ObjectPtr json_config, + Event::Dispatcher& dispatcher, Runtime::Loader& runtime, + SystemTimeSource& time_source, EventLoggerPtr event_logger) { std::shared_ptr detector( - new DetectorImpl(cluster, dispatcher, runtime, time_source, event_logger)); + new DetectorImpl(cluster, json_config, dispatcher, runtime, time_source, event_logger)); detector->initialize(cluster); return detector; } @@ -114,7 +123,7 @@ void DetectorImpl::addHostSink(HostPtr host) { void DetectorImpl::armIntervalTimer() { interval_timer_->enableTimer(std::chrono::milliseconds( - runtime_.snapshot().getInteger("outlier_detection.interval_ms", 10000))); + runtime_.snapshot().getInteger("outlier_detection.interval_ms", config_.intervalMs()))); } void DetectorImpl::checkHostForUneject(HostPtr host, DetectorHostSinkImpl* sink, SystemTime now) { @@ -122,8 +131,9 @@ void DetectorImpl::checkHostForUneject(HostPtr host, DetectorHostSinkImpl* sink, return; } - std::chrono::milliseconds base_eject_time = std::chrono::milliseconds( - runtime_.snapshot().getInteger("outlier_detection.base_ejection_time_ms", 30000)); + std::chrono::milliseconds base_eject_time = + std::chrono::milliseconds(runtime_.snapshot().getInteger( + "outlier_detection.base_ejection_time_ms", config_.baseEjectionTimeMs())); ASSERT(sink->numEjections() > 0) if ((base_eject_time * sink->numEjections()) <= (now - sink->lastEjectionTime().value())) { stats_.ejections_active_.dec(); @@ -139,11 +149,12 @@ void DetectorImpl::checkHostForUneject(HostPtr host, DetectorHostSinkImpl* sink, void DetectorImpl::ejectHost(HostPtr host, EjectionType type) { uint64_t max_ejection_percent = std::min( - 100, runtime_.snapshot().getInteger("outlier_detection.max_ejection_percent", 10)); + 100, runtime_.snapshot().getInteger("outlier_detection.max_ejection_percent", + config_.maxEjectionPercent())); double ejected_percent = 100.0 * stats_.ejections_active_.value() / host_sinks_.size(); if (ejected_percent < max_ejection_percent) { stats_.ejections_total_.inc(); - if (runtime_.snapshot().featureEnabled("outlier_detection.enforcing", 100)) { + if (runtime_.snapshot().featureEnabled("outlier_detection.enforcing", config_.enforcing())) { stats_.ejections_active_.inc(); host_sinks_[host]->eject(time_source_.currentSystemTime()); runCallbacks(host); diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 64d38086e6842..ec187cd35b242 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -84,6 +84,27 @@ struct DetectionStats { ALL_OUTLIER_DETECTION_STATS(GENERATE_COUNTER_STRUCT, GENERATE_GAUGE_STRUCT) }; +/** + * Configuration for the outlier detection. + */ +class DetectorConfig { +public: + DetectorConfig(const Json::ObjectPtr json_config); + + uint64_t intervalMs() { return interval_ms_; } + uint64_t baseEjectionTimeMs() { return base_ejection_time_ms_; } + uint64_t consecutive5xx() { return consecutive_5xx_; } + uint64_t maxEjectionPercent() { return max_ejection_percent_; } + uint64_t enforcing() { return enforcing_; } + +private: + uint64_t interval_ms_{}; + uint64_t base_ejection_time_ms_{}; + uint64_t consecutive_5xx_{}; + uint64_t max_ejection_percent_{}; + uint64_t enforcing_{}; +}; + /** * An implementation of an outlier detector. In the future we may support multiple outlier detection * implementations with different configuration. For now, as we iterate everything is contained @@ -91,20 +112,21 @@ struct DetectionStats { */ class DetectorImpl : public Detector, public std::enable_shared_from_this { public: - static std::shared_ptr create(const Cluster& cluster, Event::Dispatcher& dispatcher, - Runtime::Loader& runtime, - SystemTimeSource& time_source, - EventLoggerPtr event_logger); + static std::shared_ptr + create(const Cluster& cluster, const Json::ObjectPtr& json_config, Event::Dispatcher& dispatcher, + Runtime::Loader& runtime, SystemTimeSource& time_source, EventLoggerPtr event_logger); ~DetectorImpl(); void onConsecutive5xx(HostPtr host); Runtime::Loader& runtime() { return runtime_; } + DetectorConfig& config() { return config_; } // Upstream::Outlier::Detector void addChangedStateCb(ChangeStateCb cb) override { callbacks_.push_back(cb); } private: - DetectorImpl(const Cluster& cluster, Event::Dispatcher& dispatcher, Runtime::Loader& runtime, + DetectorImpl(const Cluster& cluster, const Json::Object& json_config, + Event::Dispatcher& dispatcher, Runtime::Loader& runtime, SystemTimeSource& time_source, EventLoggerPtr event_logger); void addHostSink(HostPtr host); @@ -117,6 +139,7 @@ class DetectorImpl : public Detector, public std::enable_shared_from_this cluster; + NiceMock dispatcher; + NiceMock runtime; + EXPECT_NE(nullptr, + DetectorImplFactory::createForCluster(cluster, *loader, dispatcher, runtime, nullptr)); +} + class CallbackChecker { public: MOCK_METHOD1(check, void(HostPtr host)); From 385e0c352386a59d70956c3576642dcefa261004 Mon Sep 17 00:00:00 2001 From: Shriram Rajagopalan Date: Tue, 7 Mar 2017 11:28:10 -0500 Subject: [PATCH 2/7] compilation fixes --- .../common/upstream/outlier_detection_impl.cc | 17 ++++++++------- .../common/upstream/outlier_detection_impl.h | 4 ++-- .../upstream/outlier_detection_impl_test.cc | 21 ++++++++++--------- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 2096a93632887..bdd259d6aa97d 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -53,16 +53,17 @@ void DetectorHostSinkImpl::putHttpResponseCode(uint64_t response_code) { } } -DetectorConfig::DetectorConfig(const Json::ObjectPtr json_config) { - interval_ms_ = static_cast json_config->getInteger("interval_ms", 10000); +DetectorConfig::DetectorConfig(const Json::ObjectPtr& json_config) { + interval_ms_ = static_cast(json_config->getInteger("interval_ms", 10000)); base_ejection_time_ms_ = - static_cast json_config->getInteger("base_ejection_time_ms", 30000); - consecutive_5xx_ = static_cast json_config->getInteger("consecutive_5xx", 5); - max_ejection_percent_ = static_cast json_config->getInteger("max_ejection_percent", 10); - enforcing_ = static_cast json_config->getInteger("enforcing", 100); + static_cast(json_config->getInteger("base_ejection_time_ms", 30000)); + consecutive_5xx_ = static_cast(json_config->getInteger("consecutive_5xx", 5)); + max_ejection_percent_ = + static_cast(json_config->getInteger("max_ejection_percent", 10)); + enforcing_ = static_cast(json_config->getInteger("enforcing", 100)); } -DetectorImpl::DetectorImpl(const Cluster& cluster, const Json::ObjectPtr json_config, +DetectorImpl::DetectorImpl(const Cluster& cluster, const Json::ObjectPtr& json_config, Event::Dispatcher& dispatcher, Runtime::Loader& runtime, SystemTimeSource& time_source, EventLoggerPtr event_logger) : config_(json_config), dispatcher_(dispatcher), runtime_(runtime), time_source_(time_source), @@ -80,7 +81,7 @@ DetectorImpl::~DetectorImpl() { } std::shared_ptr -DetectorImpl::create(const Cluster& cluster, const Json::ObjectPtr json_config, +DetectorImpl::create(const Cluster& cluster, const Json::ObjectPtr& json_config, Event::Dispatcher& dispatcher, Runtime::Loader& runtime, SystemTimeSource& time_source, EventLoggerPtr event_logger) { std::shared_ptr detector( diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index ec187cd35b242..5662e0f60b094 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -89,7 +89,7 @@ struct DetectionStats { */ class DetectorConfig { public: - DetectorConfig(const Json::ObjectPtr json_config); + DetectorConfig(const Json::ObjectPtr& json_config); uint64_t intervalMs() { return interval_ms_; } uint64_t baseEjectionTimeMs() { return base_ejection_time_ms_; } @@ -125,7 +125,7 @@ class DetectorImpl : public Detector, public std::enable_shared_from_this event_logger_{new MockEventLogger()}; + Json::ObjectPtr loader_ = Json::Factory::LoadFromString("{}"); }; TEST_F(OutlierDetectorImplTest, DestroyWithActive) { @@ -91,7 +92,7 @@ TEST_F(OutlierDetectorImplTest, DestroyWithActive) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); @@ -121,7 +122,7 @@ TEST_F(OutlierDetectorImplTest, DestroyHostInUse) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); detector.reset(); @@ -139,7 +140,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlow) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); cluster_.hosts_.push_back(HostPtr{new HostImpl( @@ -196,7 +197,7 @@ TEST_F(OutlierDetectorImplTest, RemoveWhileEjected) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); @@ -235,7 +236,7 @@ TEST_F(OutlierDetectorImplTest, Overflow) { false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); ON_CALL(runtime_.snapshot_, getInteger("outlier_detection.max_ejection_percent", _)) @@ -273,7 +274,7 @@ TEST_F(OutlierDetectorImplTest, NotEnforcing) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); @@ -298,7 +299,7 @@ TEST_F(OutlierDetectorImplTest, CrossThreadRemoveRace) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); @@ -324,7 +325,7 @@ TEST_F(OutlierDetectorImplTest, CrossThreadDestroyRace) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); @@ -351,7 +352,7 @@ TEST_F(OutlierDetectorImplTest, CrossThreadFailRace) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); @@ -384,7 +385,7 @@ TEST_F(OutlierDetectorImplTest, Consecutive5xxAlreadyEjected) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); // Cause a consecutive 5xx error. From 35dbd98f18dbfd566218ce0af280da505cb8a69a Mon Sep 17 00:00:00 2001 From: Shriram Rajagopalan Date: Tue, 7 Mar 2017 13:19:38 -0500 Subject: [PATCH 3/7] docs --- .../configuration/cluster_manager/cluster.rst | 27 ++++++++++++++++--- .../cluster_manager/cluster_runtime.rst | 12 ++------- docs/intro/arch_overview/outlier.rst | 8 +++--- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/docs/configuration/cluster_manager/cluster.rst b/docs/configuration/cluster_manager/cluster.rst index a12168fff87ac..1054ce8588f5c 100644 --- a/docs/configuration/cluster_manager/cluster.rst +++ b/docs/configuration/cluster_manager/cluster.rst @@ -138,9 +138,30 @@ dns_refresh_rate_ms outlier_detection *(optional, object)* If specified, outlier detection will be enabled for this upstream cluster. - Currently the presence of the empty object enables it and there are no options. See the - :ref:`architecture overview ` for more information on outlier - detection. + See the :ref:`architecture overview ` for more information on outlier + detection. The following configuration values are supported: + + consecutive_5xx + The number of consecutive 5xx responses before a consecutive 5xx ejection occurs. Defaults to 5. + + interval_ms + The time interval between ejection analysis sweeps. This can result in both new ejections as well + as hosts being returned to service. Defaults to 10000ms or 10s. + + base_ejection_time_ms + The base time that a host is ejected for. The real time is equal to the base time multiplied by + the number of times the host has been ejected. Defaults to 30000ms or 30s. + + max_ejection_percent + The maximum % of an upstream cluster that can be ejected due to outlier detection. Defaults to 10%. + + enforcing + The % chance that a host will be actually ejected when an outlier status is detected. This setting + can be used to disable ejection or to ramp it up slowly. Defaults to 100. + +Each of the above configuration values can be overridden via +:ref:`runtime values `. + .. toctree:: :hidden: diff --git a/docs/configuration/cluster_manager/cluster_runtime.rst b/docs/configuration/cluster_manager/cluster_runtime.rst index a64a1e3c709a1..e2b81648a2e44 100644 --- a/docs/configuration/cluster_manager/cluster_runtime.rst +++ b/docs/configuration/cluster_manager/cluster_runtime.rst @@ -29,26 +29,18 @@ Outlier detection ----------------- See the outlier detection :ref:`architecture overview ` for more -information on outlier detection. +information on outlier detection. The runtime parameters supported by outlier detection are the +same as the :ref:`static configuration parameters `, namely outlier_detection.consecutive_5xx - The number of consecutive 5xx responses before a consecutive 5xx ejection occurs. Defaults to 5. outlier_detection.interval_ms - The time interval between ejection analysis sweeps. This can result in both new ejections as well - as hosts being returned to service. Defaults to 10000ms or 10s. outlier_detection.base_ejection_time_ms - The base time that a host is ejected for. The real time is equal to the base time multiplied by - the number of times the host has been ejected. Defaults to 30000ms or 30s. outlier_detection.max_ejection_percent - The maximum % of an upstream cluster that can be ejected due to outlier detection. Defaults to - 10%. outlier_detection.enforcing - The % chance that a host will be actually ejected when an outlier status is detected. This setting - can be used to disable ejection or to ramp it up slowly. Defaults to 100. Core ---- diff --git a/docs/intro/arch_overview/outlier.rst b/docs/intro/arch_overview/outlier.rst index 865e5cde58d4d..649094b55e371 100644 --- a/docs/intro/arch_overview/outlier.rst +++ b/docs/intro/arch_overview/outlier.rst @@ -20,14 +20,14 @@ ejection algorithm works as follows: #. A host is determined to be an outlier. #. Envoy checks to make sure the number of ejected hosts is below the allowed threshold (specified - via the :ref:`outlier_detection.max_ejection_percent - ` runtime value). + via the :ref:`outlier_detection.max_ejection_percent + ` value. If the number of ejected hosts is above the threshold the host is not ejected. #. The host is ejected for some number of milliseconds. Ejection means that the host is marked unhealthy and will not be used during load balancing unless the load balancer is in a :ref:`panic ` scenario. The number of milliseconds is equal to the :ref:`outlier_detection.base_ejection_time_ms - ` runtime value + ` value multiplied by the number of times the host has been ejected. This causes hosts to get ejected for longer and longer periods if they continue to fail. #. An ejected host will automatically be brought back into service after the ejection time has @@ -46,7 +46,7 @@ If an upstream host returns some number of consecutive 5xx, it will be ejected. case a 5xx means an actual 5xx respond code, or an event that would cause the HTTP router to return one on the upstream's behalf (reset, connection failure, etc.). The number of consecutive 5xx required for ejection is controlled by the :ref:`outlier_detection.consecutive_5xx -` runtime value. +` value. Ejection event logging ---------------------- From 15e7b4860cb23e2a317c9617152b1b057dd29485 Mon Sep 17 00:00:00 2001 From: Shriram Rajagopalan Date: Tue, 7 Mar 2017 13:45:38 -0500 Subject: [PATCH 4/7] nits --- docs/configuration/cluster_manager/cluster.rst | 14 ++++++++++++-- .../cluster_manager/cluster_runtime.rst | 15 +++++++++++++++ docs/intro/arch_overview/outlier.rst | 2 +- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/docs/configuration/cluster_manager/cluster.rst b/docs/configuration/cluster_manager/cluster.rst index 1054ce8588f5c..e0337439b8256 100644 --- a/docs/configuration/cluster_manager/cluster.rst +++ b/docs/configuration/cluster_manager/cluster.rst @@ -141,26 +141,36 @@ outlier_detection See the :ref:`architecture overview ` for more information on outlier detection. The following configuration values are supported: + .. _config_cluster_manager_cluster_outlier_detection_consecutive_5xx: + consecutive_5xx The number of consecutive 5xx responses before a consecutive 5xx ejection occurs. Defaults to 5. + .. _config_cluster_manager_cluster_outlier_detection_interval_ms: + interval_ms The time interval between ejection analysis sweeps. This can result in both new ejections as well as hosts being returned to service. Defaults to 10000ms or 10s. + .. _config_cluster_manager_cluster_outlier_detection_base_ejection_time_ms: + base_ejection_time_ms The base time that a host is ejected for. The real time is equal to the base time multiplied by the number of times the host has been ejected. Defaults to 30000ms or 30s. + .. _config_cluster_manager_cluster_outlier_detection_max_ejection_percent: + max_ejection_percent The maximum % of an upstream cluster that can be ejected due to outlier detection. Defaults to 10%. + .. _config_cluster_manager_cluster_outlier_detection_enforcing: + enforcing The % chance that a host will be actually ejected when an outlier status is detected. This setting can be used to disable ejection or to ramp it up slowly. Defaults to 100. -Each of the above configuration values can be overridden via -:ref:`runtime values `. + Each of the above configuration values can be overridden via + :ref:`runtime values `. .. toctree:: diff --git a/docs/configuration/cluster_manager/cluster_runtime.rst b/docs/configuration/cluster_manager/cluster_runtime.rst index e2b81648a2e44..427fe57da3fc5 100644 --- a/docs/configuration/cluster_manager/cluster_runtime.rst +++ b/docs/configuration/cluster_manager/cluster_runtime.rst @@ -33,14 +33,29 @@ information on outlier detection. The runtime parameters supported by outlier de same as the :ref:`static configuration parameters `, namely outlier_detection.consecutive_5xx + :ref:`consecutive_5XX + ` + setting in outlier detection outlier_detection.interval_ms + :ref:`interval_ms + ` + setting in outlier detection outlier_detection.base_ejection_time_ms + :ref:`base_ejection_time_ms + ` + setting in outlier detection outlier_detection.max_ejection_percent + :ref:`max_ejection_percent + ` + setting in outlier detection outlier_detection.enforcing + :ref:`enforcing + ` + setting in outlier detection Core ---- diff --git a/docs/intro/arch_overview/outlier.rst b/docs/intro/arch_overview/outlier.rst index 649094b55e371..ee333648e6550 100644 --- a/docs/intro/arch_overview/outlier.rst +++ b/docs/intro/arch_overview/outlier.rst @@ -21,7 +21,7 @@ ejection algorithm works as follows: #. A host is determined to be an outlier. #. Envoy checks to make sure the number of ejected hosts is below the allowed threshold (specified via the :ref:`outlier_detection.max_ejection_percent - ` value. + ` setting). If the number of ejected hosts is above the threshold the host is not ejected. #. The host is ejected for some number of milliseconds. Ejection means that the host is marked unhealthy and will not be used during load balancing unless the load balancer is in a From 28ee76afb3b962d62649f1b2330bf4a7a4dcce03 Mon Sep 17 00:00:00 2001 From: Shriram Rajagopalan Date: Tue, 7 Mar 2017 18:43:52 -0500 Subject: [PATCH 5/7] address PR comments --- .../common/upstream/outlier_detection_impl.cc | 21 ++++----- .../common/upstream/outlier_detection_impl.h | 10 ++--- .../upstream/outlier_detection_impl_test.cc | 43 ++++++++++--------- 3 files changed, 36 insertions(+), 38 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index bdd259d6aa97d..620fa8357c5ed 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -14,10 +14,8 @@ DetectorPtr DetectorImplFactory::createForCluster(Cluster& cluster, Event::Dispatcher& dispatcher, Runtime::Loader& runtime, EventLoggerPtr event_logger) { - // Right now we don't support any configuration but in order to make the config backwards - // compatible we just look for an empty object. if (cluster_config.hasObject("outlier_detection")) { - return DetectorImpl::create(cluster, cluster_config.getObject("outlier_detection", true), + return DetectorImpl::create(cluster, cluster_config.getObject("outlier_detection", false), dispatcher, runtime, ProdSystemTimeSource::instance_, event_logger); } else { return nullptr; @@ -53,15 +51,14 @@ void DetectorHostSinkImpl::putHttpResponseCode(uint64_t response_code) { } } -DetectorConfig::DetectorConfig(const Json::ObjectPtr& json_config) { - interval_ms_ = static_cast(json_config->getInteger("interval_ms", 10000)); - base_ejection_time_ms_ = - static_cast(json_config->getInteger("base_ejection_time_ms", 30000)); - consecutive_5xx_ = static_cast(json_config->getInteger("consecutive_5xx", 5)); - max_ejection_percent_ = - static_cast(json_config->getInteger("max_ejection_percent", 10)); - enforcing_ = static_cast(json_config->getInteger("enforcing", 100)); -} +DetectorConfig::DetectorConfig(const Json::ObjectPtr& json_config) + : interval_ms_(static_cast(json_config->getInteger("interval_ms", 10000))), + base_ejection_time_ms_( + static_cast(json_config->getInteger("base_ejection_time_ms", 30000))), + consecutive_5xx_(static_cast(json_config->getInteger("consecutive_5xx", 5))), + max_ejection_percent_( + static_cast(json_config->getInteger("max_ejection_percent", 10))), + enforcing_(static_cast(json_config->getInteger("enforcing", 100))) {} DetectorImpl::DetectorImpl(const Cluster& cluster, const Json::ObjectPtr& json_config, Event::Dispatcher& dispatcher, Runtime::Loader& runtime, diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 5662e0f60b094..17f669e2f1861 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -98,11 +98,11 @@ class DetectorConfig { uint64_t enforcing() { return enforcing_; } private: - uint64_t interval_ms_{}; - uint64_t base_ejection_time_ms_{}; - uint64_t consecutive_5xx_{}; - uint64_t max_ejection_percent_{}; - uint64_t enforcing_{}; + const uint64_t interval_ms_{}; + const uint64_t base_ejection_time_ms_{}; + const uint64_t consecutive_5xx_{}; + const uint64_t max_ejection_percent_{}; + const uint64_t enforcing_{}; }; /** diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index fe56cd1afe2df..3012fcf273f39 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -43,27 +43,6 @@ TEST(OutlierDetectorImplFactoryTest, Detector) { DetectorImplFactory::createForCluster(cluster, *loader, dispatcher, runtime, nullptr)); } -TEST(OutlierDetectorImplFactoryTest, DetectorConfig) { - std::string json = R"EOF( - { - "outlier_detection": { - "interval_ms" : 100, - "base_eject_time_ms" : 10000, - "consecutive_5xx" : 10, - "max_ejection_percent" : 50, - "enforcing" : 10 - } - } - )EOF"; - - Json::ObjectPtr loader = Json::Factory::LoadFromString(json); - NiceMock cluster; - NiceMock dispatcher; - NiceMock runtime; - EXPECT_NE(nullptr, - DetectorImplFactory::createForCluster(cluster, *loader, dispatcher, runtime, nullptr)); -} - class CallbackChecker { public: MOCK_METHOD1(check, void(HostPtr host)); @@ -86,6 +65,28 @@ class OutlierDetectorImplTest : public testing::Test { Json::ObjectPtr loader_ = Json::Factory::LoadFromString("{}"); }; +TEST_F(OutlierDetectorImplTest, DetectorStaticConfig) { + std::string json = R"EOF( + { + "interval_ms" : 100, + "base_ejection_time_ms" : 10000, + "consecutive_5xx" : 10, + "max_ejection_percent" : 50, + "enforcing" : 10 + } + )EOF"; + + Json::ObjectPtr custom_config = Json::Factory::LoadFromString(json); + std::shared_ptr detector(DetectorImpl::create( + cluster_, custom_config, dispatcher_, runtime_, time_source_, event_logger_)); + + EXPECT_EQ(100UL, detector->config().intervalMs()); + EXPECT_EQ(10000UL, detector->config().baseEjectionTimeMs()); + EXPECT_EQ(10UL, detector->config().consecutive5xx()); + EXPECT_EQ(50UL, detector->config().maxEjectionPercent()); + EXPECT_EQ(10UL, detector->config().enforcing()); +} + TEST_F(OutlierDetectorImplTest, DestroyWithActive) { EXPECT_CALL(cluster_, addMemberUpdateCb(_)); cluster_.hosts_ = {HostPtr{new HostImpl( From a336355e0f937f4df6c2d99d35ae199de36c2d16 Mon Sep 17 00:00:00 2001 From: Shriram Rajagopalan Date: Tue, 7 Mar 2017 19:00:40 -0500 Subject: [PATCH 6/7] remove unnecessay {} initializers from const members --- source/common/upstream/outlier_detection_impl.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 17f669e2f1861..cf960e19c76ab 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -98,11 +98,11 @@ class DetectorConfig { uint64_t enforcing() { return enforcing_; } private: - const uint64_t interval_ms_{}; - const uint64_t base_ejection_time_ms_{}; - const uint64_t consecutive_5xx_{}; - const uint64_t max_ejection_percent_{}; - const uint64_t enforcing_{}; + const uint64_t interval_ms_; + const uint64_t base_ejection_time_ms_; + const uint64_t consecutive_5xx_; + const uint64_t max_ejection_percent_; + const uint64_t enforcing_; }; /** From 97209f3b7c4d5b440e3b8af023d525c358619785 Mon Sep 17 00:00:00 2001 From: Shriram Rajagopalan Date: Thu, 9 Mar 2017 14:09:07 -0500 Subject: [PATCH 7/7] Json ObjectPtr to Json Object ref --- .../common/upstream/outlier_detection_impl.cc | 20 ++++++++--------- .../common/upstream/outlier_detection_impl.h | 6 ++--- .../upstream/outlier_detection_impl_test.cc | 22 +++++++++---------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 620fa8357c5ed..13bc0b06fc780 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -15,8 +15,8 @@ DetectorPtr DetectorImplFactory::createForCluster(Cluster& cluster, Runtime::Loader& runtime, EventLoggerPtr event_logger) { if (cluster_config.hasObject("outlier_detection")) { - return DetectorImpl::create(cluster, cluster_config.getObject("outlier_detection", false), - dispatcher, runtime, ProdSystemTimeSource::instance_, event_logger); + return DetectorImpl::create(cluster, *cluster_config.getObject("outlier_detection"), dispatcher, + runtime, ProdSystemTimeSource::instance_, event_logger); } else { return nullptr; } @@ -51,16 +51,16 @@ void DetectorHostSinkImpl::putHttpResponseCode(uint64_t response_code) { } } -DetectorConfig::DetectorConfig(const Json::ObjectPtr& json_config) - : interval_ms_(static_cast(json_config->getInteger("interval_ms", 10000))), +DetectorConfig::DetectorConfig(const Json::Object& json_config) + : interval_ms_(static_cast(json_config.getInteger("interval_ms", 10000))), base_ejection_time_ms_( - static_cast(json_config->getInteger("base_ejection_time_ms", 30000))), - consecutive_5xx_(static_cast(json_config->getInteger("consecutive_5xx", 5))), + static_cast(json_config.getInteger("base_ejection_time_ms", 30000))), + consecutive_5xx_(static_cast(json_config.getInteger("consecutive_5xx", 5))), max_ejection_percent_( - static_cast(json_config->getInteger("max_ejection_percent", 10))), - enforcing_(static_cast(json_config->getInteger("enforcing", 100))) {} + static_cast(json_config.getInteger("max_ejection_percent", 10))), + enforcing_(static_cast(json_config.getInteger("enforcing", 100))) {} -DetectorImpl::DetectorImpl(const Cluster& cluster, const Json::ObjectPtr& json_config, +DetectorImpl::DetectorImpl(const Cluster& cluster, const Json::Object& json_config, Event::Dispatcher& dispatcher, Runtime::Loader& runtime, SystemTimeSource& time_source, EventLoggerPtr event_logger) : config_(json_config), dispatcher_(dispatcher), runtime_(runtime), time_source_(time_source), @@ -78,7 +78,7 @@ DetectorImpl::~DetectorImpl() { } std::shared_ptr -DetectorImpl::create(const Cluster& cluster, const Json::ObjectPtr& json_config, +DetectorImpl::create(const Cluster& cluster, const Json::Object& json_config, Event::Dispatcher& dispatcher, Runtime::Loader& runtime, SystemTimeSource& time_source, EventLoggerPtr event_logger) { std::shared_ptr detector( diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index cf960e19c76ab..b1147f4834ed3 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -89,7 +89,7 @@ struct DetectionStats { */ class DetectorConfig { public: - DetectorConfig(const Json::ObjectPtr& json_config); + DetectorConfig(const Json::Object& json_config); uint64_t intervalMs() { return interval_ms_; } uint64_t baseEjectionTimeMs() { return base_ejection_time_ms_; } @@ -113,7 +113,7 @@ class DetectorConfig { class DetectorImpl : public Detector, public std::enable_shared_from_this { public: static std::shared_ptr - create(const Cluster& cluster, const Json::ObjectPtr& json_config, Event::Dispatcher& dispatcher, + create(const Cluster& cluster, const Json::Object& json_config, Event::Dispatcher& dispatcher, Runtime::Loader& runtime, SystemTimeSource& time_source, EventLoggerPtr event_logger); ~DetectorImpl(); @@ -125,7 +125,7 @@ class DetectorImpl : public Detector, public std::enable_shared_from_this detector(DetectorImpl::create( - cluster_, custom_config, dispatcher_, runtime_, time_source_, event_logger_)); + cluster_, *custom_config, dispatcher_, runtime_, time_source_, event_logger_)); EXPECT_EQ(100UL, detector->config().intervalMs()); EXPECT_EQ(10000UL, detector->config().baseEjectionTimeMs()); @@ -93,7 +93,7 @@ TEST_F(OutlierDetectorImplTest, DestroyWithActive) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); @@ -123,7 +123,7 @@ TEST_F(OutlierDetectorImplTest, DestroyHostInUse) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); detector.reset(); @@ -141,7 +141,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlow) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); cluster_.hosts_.push_back(HostPtr{new HostImpl( @@ -198,7 +198,7 @@ TEST_F(OutlierDetectorImplTest, RemoveWhileEjected) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); @@ -237,7 +237,7 @@ TEST_F(OutlierDetectorImplTest, Overflow) { false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); ON_CALL(runtime_.snapshot_, getInteger("outlier_detection.max_ejection_percent", _)) @@ -275,7 +275,7 @@ TEST_F(OutlierDetectorImplTest, NotEnforcing) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); @@ -300,7 +300,7 @@ TEST_F(OutlierDetectorImplTest, CrossThreadRemoveRace) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); @@ -326,7 +326,7 @@ TEST_F(OutlierDetectorImplTest, CrossThreadDestroyRace) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); @@ -353,7 +353,7 @@ TEST_F(OutlierDetectorImplTest, CrossThreadFailRace) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); @@ -386,7 +386,7 @@ TEST_F(OutlierDetectorImplTest, Consecutive5xxAlreadyEjected) { cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, loader_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); // Cause a consecutive 5xx error.