From e12534a1283d1e019e29ef8bcf147571b31a96c9 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Wed, 8 Mar 2017 07:09:08 -0800 Subject: [PATCH 01/29] inital code --- include/envoy/upstream/outlier_detection.h | 6 +- .../common/upstream/outlier_detection_impl.cc | 64 ++++++++++++++++++- .../common/upstream/outlier_detection_impl.h | 20 +++++- 3 files changed, 86 insertions(+), 4 deletions(-) diff --git a/include/envoy/upstream/outlier_detection.h b/include/envoy/upstream/outlier_detection.h index fc8dc8be49e72..bba0bad2dbc47 100644 --- a/include/envoy/upstream/outlier_detection.h +++ b/include/envoy/upstream/outlier_detection.h @@ -52,7 +52,7 @@ class DetectorHostSink { typedef std::unique_ptr DetectorHostSinkPtr; -enum class EjectionType { Consecutive5xx }; +enum class EjectionType { Consecutive5xx, SuccessRate }; /** * Sink for outlier detection event logs. @@ -99,5 +99,9 @@ class Detector { typedef std::shared_ptr DetectorPtr; +struct SRAccumulatorBucket { + std::atomic success_rq_counter_; + std::atomic total_rq_counter_; +}; } // Outlier } // Upstream diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 4bb59f015b72e..43b48a25e6399 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -35,7 +35,12 @@ void DetectorHostSinkImpl::uneject(SystemTime unejection_time) { last_unejection_time_.value(unejection_time); } +void DetectorHostSinkImpl::updateCurrentSRBucket() { + sr_accumulator_bucket_.store(sr_accumulator_.getCurrentWriter()); +} + void DetectorHostSinkImpl::putHttpResponseCode(uint64_t response_code) { + sr_accumulator_bucket_.load()->total_rq_counter_++; if (Http::CodeUtility::is5xx(response_code)) { std::shared_ptr detector = detector_.lock(); if (!detector) { @@ -48,6 +53,7 @@ void DetectorHostSinkImpl::putHttpResponseCode(uint64_t response_code) { detector->onConsecutive5xx(host_.lock()); } } else { + sr_accumulator_bucket_.load()->success_rq_counter_++; consecutive_5xx_ = 0; } } @@ -200,8 +206,43 @@ void DetectorImpl::onConsecutive5xxWorker(HostPtr host) { void DetectorImpl::onIntervalTimer() { SystemTime now = time_source_.currentSystemTime(); + std::unordered_map valid_sr_hosts; + std::vector sr_data; + double sr_sum; + for (auto host : host_sinks_) { checkHostForUneject(host.first, host.second, now); + + // Success Rate Outlier Detection + // First swap out the current bucket been written to, to keep data valid + host.second->updateCurrentSRBucket(); + + // If there are not enough hosts to begin with, don't do the work. + if (host_sinks_.size() >= runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", 5)) { + Optional host_sr = host.second->srAccumulator().getSR(runtime_.snapshot().getInteger("outlier_detection.rq_volume_threshold", 100)); + if (host_sr.valid()) { + valid_sr_hosts[host.first] = host_sr.value(); + sr_data.emplace_back(host_sr.value()); + sr_sum += host_sr.value(); + } + } + } + + if (valid_sr_hosts.size() >= runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", 5)) { + + // Calculate the statistics (mean, stdev). We are using mean to detect outliers. + double mean = sr_sum / sr_data.size(); + double stdev = 0; + std::for_each(sr_data.begin(), sr_data.end(), [&stdev, mean](double& v){ stdev += std::pow(v - mean, 2); }); + stdev /= sr_data.size(); + stdev = std::sqrt(stdev); + + + for (auto host : valid_sr_hosts) { + if (host.second < mean - (2 * stdev)) { + ejectHost(host.first, EjectionType::SuccessRate); + } + } } armIntervalTimer(); @@ -256,8 +297,10 @@ void EventLoggerImpl::logUneject(HostDescriptionPtr host) { std::string EventLoggerImpl::typeToString(EjectionType type) { switch (type) { - case EjectionType::Consecutive5xx: - return "5xx"; + case EjectionType::Consecutive5xx: + return "5xx"; + case EjectionType::SuccessRate: + return "SR"; } NOT_IMPLEMENTED; @@ -271,5 +314,22 @@ int EventLoggerImpl::secsSinceLastAction(const Optional& lastActionT return -1; } +SRAccumulatorBucket* SRAccumulatorImpl::getCurrentWriter() { + // Right now current_ is being written to and backup_ is not. Flush the backup and swap + backup_sr_bucket_->success_rq_counter_ = 0; + backup_sr_bucket_->total_rq_counter_ = 0; + + current_sr_bucket_.swap(backup_sr_bucket_); + + return current_sr_bucket_.get(); +} + +Optional SRAccumulatorImpl::getSR(uint64_t rq_volume_thresh) { + if (backup_sr_bucket_->total_rq_counter_ < rq_volume_thresh) { + return Optional(); + } + + return Optional(backup_sr_bucket_->success_rq_counter_ * 100 /backup_sr_bucket_->total_rq_counter_); +} } // Outlier } // Upstream diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 64d38086e6842..810cdddf0eb9b 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -39,16 +39,32 @@ class DetectorImplFactory { class DetectorImpl; +class SRAccumulatorImpl { +public: + SRAccumulatorImpl() : current_sr_bucket_(new SRAccumulatorBucket()), backup_sr_bucket_(new SRAccumulatorBucket()) {}; + SRAccumulatorBucket* getCurrentWriter(); + Optional getSR(uint64_t rq_volume_threshold); + +private: + std::unique_ptr current_sr_bucket_; + std::unique_ptr backup_sr_bucket_; +}; + /** * Implementation of DetectorHostSink for the generic detector. */ class DetectorHostSinkImpl : public DetectorHostSink { public: DetectorHostSinkImpl(std::shared_ptr detector, HostPtr host) - : detector_(detector), host_(host) {} + : detector_(detector), host_(host) { + // Point the sr_accumulator_bucket_ pointer to a bucket. + updateCurrentSRBucket(); + } void eject(SystemTime ejection_time); void uneject(SystemTime ejection_time); + void updateCurrentSRBucket(); + SRAccumulatorImpl& srAccumulator() { return sr_accumulator_; }; // Upstream::Outlier::DetectorHostSink uint32_t numEjections() override { return num_ejections_; } @@ -64,6 +80,8 @@ class DetectorHostSinkImpl : public DetectorHostSink { Optional last_ejection_time_; Optional last_unejection_time_; uint32_t num_ejections_{}; + SRAccumulatorImpl sr_accumulator_; + std::atomic sr_accumulator_bucket_; }; /** From 2553e6c8000d204c78707d21638ccf1d6d11d28e Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Wed, 8 Mar 2017 07:10:03 -0800 Subject: [PATCH 02/29] fix format --- .../common/upstream/outlier_detection_impl.cc | 24 +++++++++++-------- .../common/upstream/outlier_detection_impl.h | 4 +++- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 43b48a25e6399..88c95cb0b47af 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -218,8 +218,10 @@ void DetectorImpl::onIntervalTimer() { host.second->updateCurrentSRBucket(); // If there are not enough hosts to begin with, don't do the work. - if (host_sinks_.size() >= runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", 5)) { - Optional host_sr = host.second->srAccumulator().getSR(runtime_.snapshot().getInteger("outlier_detection.rq_volume_threshold", 100)); + if (host_sinks_.size() >= + runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", 5)) { + Optional host_sr = host.second->srAccumulator().getSR( + runtime_.snapshot().getInteger("outlier_detection.rq_volume_threshold", 100)); if (host_sr.valid()) { valid_sr_hosts[host.first] = host_sr.value(); sr_data.emplace_back(host_sr.value()); @@ -228,16 +230,17 @@ void DetectorImpl::onIntervalTimer() { } } - if (valid_sr_hosts.size() >= runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", 5)) { + if (valid_sr_hosts.size() >= + runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", 5)) { // Calculate the statistics (mean, stdev). We are using mean to detect outliers. double mean = sr_sum / sr_data.size(); double stdev = 0; - std::for_each(sr_data.begin(), sr_data.end(), [&stdev, mean](double& v){ stdev += std::pow(v - mean, 2); }); + std::for_each(sr_data.begin(), sr_data.end(), + [&stdev, mean](double& v) { stdev += std::pow(v - mean, 2); }); stdev /= sr_data.size(); stdev = std::sqrt(stdev); - for (auto host : valid_sr_hosts) { if (host.second < mean - (2 * stdev)) { ejectHost(host.first, EjectionType::SuccessRate); @@ -297,10 +300,10 @@ void EventLoggerImpl::logUneject(HostDescriptionPtr host) { std::string EventLoggerImpl::typeToString(EjectionType type) { switch (type) { - case EjectionType::Consecutive5xx: - return "5xx"; - case EjectionType::SuccessRate: - return "SR"; + case EjectionType::Consecutive5xx: + return "5xx"; + case EjectionType::SuccessRate: + return "SR"; } NOT_IMPLEMENTED; @@ -329,7 +332,8 @@ Optional SRAccumulatorImpl::getSR(uint64_t rq_volume_thresh) { return Optional(); } - return Optional(backup_sr_bucket_->success_rq_counter_ * 100 /backup_sr_bucket_->total_rq_counter_); + return Optional(backup_sr_bucket_->success_rq_counter_ * 100 / + backup_sr_bucket_->total_rq_counter_); } } // Outlier } // Upstream diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 810cdddf0eb9b..7d12f4d4e1818 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -41,7 +41,9 @@ class DetectorImpl; class SRAccumulatorImpl { public: - SRAccumulatorImpl() : current_sr_bucket_(new SRAccumulatorBucket()), backup_sr_bucket_(new SRAccumulatorBucket()) {}; + SRAccumulatorImpl() + : current_sr_bucket_(new SRAccumulatorBucket()), + backup_sr_bucket_(new SRAccumulatorBucket()){}; SRAccumulatorBucket* getCurrentWriter(); Optional getSR(uint64_t rq_volume_threshold); From 65c8c8cfb4e9cb8b7784c45564f499cc69db3186 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Wed, 8 Mar 2017 09:39:07 -0700 Subject: [PATCH 03/29] add sr ejection counter --- source/common/upstream/outlier_detection_impl.cc | 1 + source/common/upstream/outlier_detection_impl.h | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 88c95cb0b47af..388095f62c5b5 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -243,6 +243,7 @@ void DetectorImpl::onIntervalTimer() { for (auto host : valid_sr_hosts) { if (host.second < mean - (2 * stdev)) { + stats_.ejections_consecutive_5xx_.inc(); ejectHost(host.first, EjectionType::SuccessRate); } } diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 7d12f4d4e1818..41e4531f1097d 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -94,7 +94,8 @@ class DetectorHostSinkImpl : public DetectorHostSink { COUNTER(ejections_total) \ GAUGE (ejections_active) \ COUNTER(ejections_overflow) \ - COUNTER(ejections_consecutive_5xx) + COUNTER(ejections_consecutive_5xx) \ + COUNTER(ejections_sr) // clang-format on /** From df94dde9bb52a0d544a417cf7b965710e6767a4d Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Wed, 8 Mar 2017 10:03:08 -0700 Subject: [PATCH 04/29] fix --- source/common/upstream/outlier_detection_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 388095f62c5b5..79e7747479199 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -208,7 +208,7 @@ void DetectorImpl::onIntervalTimer() { SystemTime now = time_source_.currentSystemTime(); std::unordered_map valid_sr_hosts; std::vector sr_data; - double sr_sum; + double sr_sum = 0; for (auto host : host_sinks_) { checkHostForUneject(host.first, host.second, now); From 5cf0a52f84df9ea6b211cc94b9f4126c542954e5 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Thu, 9 Mar 2017 14:17:02 -0600 Subject: [PATCH 05/29] update --- include/envoy/upstream/outlier_detection.h | 4 --- .../common/upstream/outlier_detection_impl.cc | 5 ++-- .../common/upstream/outlier_detection_impl.h | 26 ++++++++++++++++++- .../upstream/outlier_detection_impl_test.cc | 23 ++++++++++++++-- 4 files changed, 49 insertions(+), 9 deletions(-) diff --git a/include/envoy/upstream/outlier_detection.h b/include/envoy/upstream/outlier_detection.h index bba0bad2dbc47..14b2ebc1b992f 100644 --- a/include/envoy/upstream/outlier_detection.h +++ b/include/envoy/upstream/outlier_detection.h @@ -99,9 +99,5 @@ class Detector { typedef std::shared_ptr DetectorPtr; -struct SRAccumulatorBucket { - std::atomic success_rq_counter_; - std::atomic total_rq_counter_; -}; } // Outlier } // Upstream diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 79e7747479199..ce1b98d7a02f1 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -243,7 +243,7 @@ void DetectorImpl::onIntervalTimer() { for (auto host : valid_sr_hosts) { if (host.second < mean - (2 * stdev)) { - stats_.ejections_consecutive_5xx_.inc(); + stats_.ejections_sr_.inc(); ejectHost(host.first, EjectionType::SuccessRate); } } @@ -319,7 +319,7 @@ int EventLoggerImpl::secsSinceLastAction(const Optional& lastActionT } SRAccumulatorBucket* SRAccumulatorImpl::getCurrentWriter() { - // Right now current_ is being written to and backup_ is not. Flush the backup and swap + // Right now current_ is being written to and backup_ is not. Flush the backup and swap. backup_sr_bucket_->success_rq_counter_ = 0; backup_sr_bucket_->total_rq_counter_ = 0; @@ -336,5 +336,6 @@ Optional SRAccumulatorImpl::getSR(uint64_t rq_volume_thresh) { return Optional(backup_sr_bucket_->success_rq_counter_ * 100 / backup_sr_bucket_->total_rq_counter_); } + } // Outlier } // Upstream diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 41e4531f1097d..012c9786356fd 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -37,14 +37,36 @@ class DetectorImplFactory { EventLoggerPtr event_logger); }; -class DetectorImpl; +struct SRAccumulatorBucket { + std::atomic success_rq_counter_; + std::atomic total_rq_counter_; +}; +/** + * The S(uccess)R(ate)AccumulatorImpl uses the SRAccumulatorBucket to get per host Success Rate + * stats. + * This implementation has a fixed window size of time, and thus only needs a bucket to write to, + * and + * a bucket to accumulate/run stats over. + */ class SRAccumulatorImpl { public: SRAccumulatorImpl() : current_sr_bucket_(new SRAccumulatorBucket()), backup_sr_bucket_(new SRAccumulatorBucket()){}; SRAccumulatorBucket* getCurrentWriter(); + /** + * This function returns the SR of a host over a window of time if the request volume is high + * enough. The underlying + * window of time could be dynamically adjusted. In the current implementation it is a fixed time + * window. + * @param rq_volume_threshold, the threshold of requests an accumulator has to have in order to be + * able to return + * a significant SR value. + * @return a valid Optional with the success rate. If there were not enough requests, an + * invalid Optional + * is returned. + */ Optional getSR(uint64_t rq_volume_threshold); private: @@ -52,6 +74,8 @@ class SRAccumulatorImpl { std::unique_ptr backup_sr_bucket_; }; +class DetectorImpl; + /** * Implementation of DetectorHostSink for the generic detector. */ diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 940bf5b62edfe..4813a978880ea 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -161,11 +161,30 @@ TEST_F(OutlierDetectorImplTest, BasicFlow) { interval_timer_->callback_(); EXPECT_FALSE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + // Eject host again to cause an ejection after an unejection has taken place + cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(200); + cluster_.hosts_[0]->outlierDetector().putResponseTime(std::chrono::milliseconds(5)); + cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + + EXPECT_CALL(time_source_, currentSystemTime()) + .WillOnce(Return(SystemTime(std::chrono::milliseconds(40000)))); + EXPECT_CALL(checker_, check(cluster_.hosts_[0])); + EXPECT_CALL(*event_logger_, + logEject(std::static_pointer_cast(cluster_.hosts_[0]), + EjectionType::Consecutive5xx)); + cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); + cluster_.runCallbacks({}, cluster_.hosts_); EXPECT_EQ(0UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); - EXPECT_EQ(1UL, cluster_.info_->stats_store_.counter("outlier_detection.ejections_total").value()); - EXPECT_EQ(1UL, cluster_.info_->stats_store_.counter("outlier_detection.ejections_consecutive_5xx") + EXPECT_EQ(2UL, cluster_.info_->stats_store_.counter("outlier_detection.ejections_total").value()); + EXPECT_EQ(2UL, cluster_.info_->stats_store_.counter("outlier_detection.ejections_consecutive_5xx") .value()); } From ab66fa198c0597960cc7d0dd69547b535942a225 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Thu, 9 Mar 2017 14:32:04 -0600 Subject: [PATCH 06/29] update --- source/common/upstream/outlier_detection_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index f9dd65ead94aa..61c804cc5f963 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -53,7 +53,7 @@ class SRAccumulatorImpl { public: SRAccumulatorImpl() : current_sr_bucket_(new SRAccumulatorBucket()), - backup_sr_bucket_(new SRAccumulatorBucket()){}; + backup_sr_bucket_(new SRAccumulatorBucket()) {}; SRAccumulatorBucket* getCurrentWriter(); /** * This function returns the SR of a host over a window of time if the request volume is high From ad408f512ca36ba71daa2afa0d6c485aba6c4ed1 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Thu, 9 Mar 2017 16:00:04 -0600 Subject: [PATCH 07/29] add config values --- source/common/json/config_schemas.cc | 10 +++++++++ .../common/upstream/outlier_detection_impl.cc | 22 +++++++++++++------ .../common/upstream/outlier_detection_impl.h | 6 ++++- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/source/common/json/config_schemas.cc b/source/common/json/config_schemas.cc index 3e7ac2af15f34..09321478bea69 100644 --- a/source/common/json/config_schemas.cc +++ b/source/common/json/config_schemas.cc @@ -1069,6 +1069,16 @@ const std::string Json::Schema::CLUSTER_SCHEMA(R"EOF( "minimum" : 0, "exclusiveMinimum" : true }, + "significant_host_threshold" : { + "type" : "integer", + "minimum" : 0, + "exclusiveMinimum" : true + }, + "rq_volume_threshold" : { + "type" : "integer", + "minimum" : 0, + "exclusiveMinimum" : true + }, "interval_ms" : { "type" : "integer", "minimum" : 0, diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index ee828682a41c7..2e40738bbf7bc 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -64,7 +64,11 @@ DetectorConfig::DetectorConfig(const Json::Object& json_config) consecutive_5xx_(static_cast(json_config.getInteger("consecutive_5xx", 5))), max_ejection_percent_( static_cast(json_config.getInteger("max_ejection_percent", 10))), - enforcing_(static_cast(json_config.getInteger("enforcing", 100))) {} + enforcing_(static_cast(json_config.getInteger("enforcing", 100))), + significant_host_threshold_( + static_cast(json_config.getInteger("significant_host_threshold", 5))), + rq_volume_threshold_( + static_cast(json_config.getInteger("rq_volume_threshold", 100))) {} DetectorImpl::DetectorImpl(const Cluster& cluster, const Json::Object& json_config, Event::Dispatcher& dispatcher, Runtime::Loader& runtime, @@ -228,9 +232,12 @@ void DetectorImpl::onIntervalTimer() { // If there are not enough hosts to begin with, don't do the work. if (host_sinks_.size() >= - runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", 5)) { - Optional host_sr = host.second->srAccumulator().getSR( - runtime_.snapshot().getInteger("outlier_detection.rq_volume_threshold", 100)); + runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", + config_.significantHostThreshold())) { + + Optional host_sr = host.second->srAccumulator().getSR(runtime_.snapshot().getInteger( + "outlier_detection.rq_volume_threshold", config_.rqVolumeThreshold())); + if (host_sr.valid()) { valid_sr_hosts[host.first] = host_sr.value(); sr_data.emplace_back(host_sr.value()); @@ -240,7 +247,8 @@ void DetectorImpl::onIntervalTimer() { } if (valid_sr_hosts.size() >= - runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", 5)) { + runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", + config_.rqVolumeThreshold())) { // Calculate the statistics (mean, stdev). We are using mean to detect outliers. double mean = sr_sum / sr_data.size(); @@ -337,8 +345,8 @@ SRAccumulatorBucket* SRAccumulatorImpl::getCurrentWriter() { return current_sr_bucket_.get(); } -Optional SRAccumulatorImpl::getSR(uint64_t rq_volume_thresh) { - if (backup_sr_bucket_->total_rq_counter_ < rq_volume_thresh) { +Optional SRAccumulatorImpl::getSR(uint64_t rq_volume_threshold) { + if (backup_sr_bucket_->total_rq_counter_ < rq_volume_threshold) { return Optional(); } diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 61c804cc5f963..3eb0e423f5bc6 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -53,7 +53,7 @@ class SRAccumulatorImpl { public: SRAccumulatorImpl() : current_sr_bucket_(new SRAccumulatorBucket()), - backup_sr_bucket_(new SRAccumulatorBucket()) {}; + backup_sr_bucket_(new SRAccumulatorBucket()){}; SRAccumulatorBucket* getCurrentWriter(); /** * This function returns the SR of a host over a window of time if the request volume is high @@ -141,6 +141,8 @@ class DetectorConfig { uint64_t consecutive5xx() { return consecutive_5xx_; } uint64_t maxEjectionPercent() { return max_ejection_percent_; } uint64_t enforcing() { return enforcing_; } + uint64_t significantHostThreshold() { return significant_host_threshold_; } + uint64_t rqVolumeThreshold() { return rq_volume_threshold_; } private: const uint64_t interval_ms_; @@ -148,6 +150,8 @@ class DetectorConfig { const uint64_t consecutive_5xx_; const uint64_t max_ejection_percent_; const uint64_t enforcing_; + const uint64_t significant_host_threshold_; + const uint64_t rq_volume_threshold_; }; /** From 425556099ecdb5c519a3a5a2ddda18714e3d43cc Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Thu, 9 Mar 2017 16:14:04 -0600 Subject: [PATCH 08/29] fix format --- source/common/upstream/outlier_detection_impl.cc | 2 +- source/common/upstream/outlier_detection_impl.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 2e40738bbf7bc..0b47023a72580 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -259,7 +259,7 @@ void DetectorImpl::onIntervalTimer() { stdev = std::sqrt(stdev); for (auto host : valid_sr_hosts) { - if (host.second < mean - (2 * stdev)) { + if (host.second < mean - (sr_stdev_factor_ * stdev)) { stats_.ejections_sr_.inc(); ejectHost(host.first, EjectionType::SuccessRate); } diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 3eb0e423f5bc6..31b4537619ae4 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -197,6 +197,8 @@ class DetectorImpl : public Detector, public std::enable_shared_from_this callbacks_; std::unordered_map host_sinks_; EventLoggerPtr event_logger_; + // Factor to multiply the stdev of a cluster's Success Rate for success rate outlier ejection. + static const uint64_t sr_stdev_factor_ = 2; }; class EventLoggerImpl : public EventLogger { From 5ec0abe8a8dd4090f25352ef08b0dba86523a521 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Thu, 9 Mar 2017 17:09:18 -0600 Subject: [PATCH 09/29] move stats into function --- .../common/upstream/outlier_detection_impl.cc | 22 ++++++++++--------- .../common/upstream/outlier_detection_impl.h | 1 + 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 0b47023a72580..756fcda9598a1 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -217,6 +217,17 @@ void DetectorImpl::onConsecutive5xxWorker(HostPtr host) { ejectHost(host, EjectionType::Consecutive5xx); } +double DetectorImpl::srEjectionThreshold(double sr_sum, std::vector& sr_data) { + double mean = sr_sum / sr_data.size(); + double stdev = 0; + std::for_each(sr_data.begin(), sr_data.end(), + [&stdev, mean](double& v) { stdev += std::pow(v - mean, 2); }); + stdev /= sr_data.size(); + stdev = std::sqrt(stdev); + + return mean - (sr_stdev_factor_ * stdev); +} + void DetectorImpl::onIntervalTimer() { SystemTime now = time_source_.currentSystemTime(); std::unordered_map valid_sr_hosts; @@ -249,17 +260,8 @@ void DetectorImpl::onIntervalTimer() { if (valid_sr_hosts.size() >= runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", config_.rqVolumeThreshold())) { - - // Calculate the statistics (mean, stdev). We are using mean to detect outliers. - double mean = sr_sum / sr_data.size(); - double stdev = 0; - std::for_each(sr_data.begin(), sr_data.end(), - [&stdev, mean](double& v) { stdev += std::pow(v - mean, 2); }); - stdev /= sr_data.size(); - stdev = std::sqrt(stdev); - for (auto host : valid_sr_hosts) { - if (host.second < mean - (sr_stdev_factor_ * stdev)) { + if (host.second < srEjectionThreshold(sr_sum, sr_data)) { stats_.ejections_sr_.inc(); ejectHost(host.first, EjectionType::SuccessRate); } diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 31b4537619ae4..36c348fef048f 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -187,6 +187,7 @@ class DetectorImpl : public Detector, public std::enable_shared_from_this& sr_data); DetectorConfig config_; Event::Dispatcher& dispatcher_; From 9753ca20b8faef2396f564757deb006b715fdd43 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Thu, 16 Mar 2017 14:51:56 -0700 Subject: [PATCH 10/29] test, config, enforcing, docs --- .../configuration/cluster_manager/cluster.rst | 29 ++++++++++++++++--- .../cluster_manager/cluster_runtime.rst | 22 ++++++++++++-- docs/intro/arch_overview/outlier.rst | 14 ++++++++- source/common/json/config_schemas.cc | 7 ++++- .../common/upstream/outlier_detection_impl.cc | 21 ++++++++++++-- .../common/upstream/outlier_detection_impl.h | 7 +++-- .../upstream/outlier_detection_impl_test.cc | 16 ++++++---- 7 files changed, 97 insertions(+), 19 deletions(-) diff --git a/docs/configuration/cluster_manager/cluster.rst b/docs/configuration/cluster_manager/cluster.rst index e0337439b8256..938b8196c1fa6 100644 --- a/docs/configuration/cluster_manager/cluster.rst +++ b/docs/configuration/cluster_manager/cluster.rst @@ -163,11 +163,32 @@ outlier_detection max_ejection_percent The maximum % of an upstream cluster that can be ejected due to outlier detection. Defaults to 10%. - .. _config_cluster_manager_cluster_outlier_detection_enforcing: + .. _config_cluster_manager_cluster_outlier_detection_enforcing_consecutive_5xx: - enforcing - The % chance that a host will be actually ejected when an outlier status is detected. This setting - can be used to disable ejection or to ramp it up slowly. Defaults to 100. + enforcing_consecutive_5xx + The % chance that a host will be actually ejected when an outlier status is detected through + consecutive 5xx. This setting can be used to disable ejection or to ramp it up slowly. Defaults to 100. + + .. _config_cluster_manager_cluster_outlier_detection_enforcing_sr: + + enforcing_sr + The % chance that a host will be actually ejected when an outlier status is detected through + success rate statistics. This setting can be used to disable ejection or to ramp it up slowly. + Defaults to 100. + + .. _config_cluster_manager_cluster_outlier_detection_significant_host_threshold: + + significant_host_threshold + The number of hosts in a cluster that must have enough request volume to detect success rate outliers. + If the number of hosts is less than this setting, outlier detection via success rate statistics is not + performed for any host in the cluster. Defaults to 5. + + .. _config_cluster_manager_cluster_outlier_detection_rq_volume_threshold: + + rq_volume_threshold: + The number of total requests that must be collected in one interval to include this host in success rate + based outlier detection. If the volume is lower than this setting, outlier detection via success rate + statistics is not performed for that host. Defaults to 100. Each of the above configuration values can be overridden via :ref:`runtime values `. diff --git a/docs/configuration/cluster_manager/cluster_runtime.rst b/docs/configuration/cluster_manager/cluster_runtime.rst index 427fe57da3fc5..5f64c7986221a 100644 --- a/docs/configuration/cluster_manager/cluster_runtime.rst +++ b/docs/configuration/cluster_manager/cluster_runtime.rst @@ -52,11 +52,27 @@ outlier_detection.max_ejection_percent ` setting in outlier detection -outlier_detection.enforcing - :ref:`enforcing - ` +outlier_detection.enforcing_consecutive_5xx + :ref:`enforcing_consecutive_5xx + ` setting in outlier detection +outlier_detection.enforcing_sr + :ref:`enforcing_sr + ` + setting in outlier detection + +outlier_detection.significant_host_threshold + :ref:`significant_host_threshold + ` + setting in outlier detection + +outlier_detection.rq_volume_threshold + :ref:`rq_volume_threshold + ` + setting in outlier detection + + Core ---- diff --git a/docs/intro/arch_overview/outlier.rst b/docs/intro/arch_overview/outlier.rst index ee333648e6550..d2dcb6d731b6a 100644 --- a/docs/intro/arch_overview/outlier.rst +++ b/docs/intro/arch_overview/outlier.rst @@ -46,7 +46,19 @@ If an upstream host returns some number of consecutive 5xx, it will be ejected. case a 5xx means an actual 5xx respond code, or an event that would cause the HTTP router to return one on the upstream's behalf (reset, connection failure, etc.). The number of consecutive 5xx required for ejection is controlled by the :ref:`outlier_detection.consecutive_5xx -` value. +` value. + +Success Rate +^^^^^^^^^^^^ + +Success Rate based outlier ejection aggregates success rate data from every host in a cluster, and at a given +interval ejects hosts based on statistical outlier detection. Success Rate outlier ejection will not be +calculated for a host if its request volume over the aggregation interval is less than the +:ref:`outlier_detection.rq_volume_threshold` +value. Moreover, detection will not be performed for a cluster if the number of hosts +with enough request volume in an interval is less than the +:ref:`outlier_detection.significant_host_threshold` +value. Ejection event logging ---------------------- diff --git a/source/common/json/config_schemas.cc b/source/common/json/config_schemas.cc index 43c991db08346..5cd7e0624c219 100644 --- a/source/common/json/config_schemas.cc +++ b/source/common/json/config_schemas.cc @@ -1109,7 +1109,12 @@ const std::string Json::Schema::CLUSTER_SCHEMA(R"EOF( "minimum" : 0, "maximum" : 100 }, - "enforcing" : { + "enforcing_consecutive_5xx" : { + "type" : "integer", + "minimum" : 0, + "maximum" : 100 + }, + "enforcing_sr" : { "type" : "integer", "minimum" : 0, "maximum" : 100 diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 756fcda9598a1..b97564ee67a71 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -64,11 +64,13 @@ DetectorConfig::DetectorConfig(const Json::Object& json_config) consecutive_5xx_(static_cast(json_config.getInteger("consecutive_5xx", 5))), max_ejection_percent_( static_cast(json_config.getInteger("max_ejection_percent", 10))), - enforcing_(static_cast(json_config.getInteger("enforcing", 100))), significant_host_threshold_( static_cast(json_config.getInteger("significant_host_threshold", 5))), rq_volume_threshold_( - static_cast(json_config.getInteger("rq_volume_threshold", 100))) {} + static_cast(json_config.getInteger("rq_volume_threshold", 100))), + enforcing_consecutive_5xx_( + static_cast(json_config.getInteger("enforcing_consecutive_5xx", 100))), + enforcing_sr_(static_cast(json_config.getInteger("enforcing_sr", 100))) {} DetectorImpl::DetectorImpl(const Cluster& cluster, const Json::Object& json_config, Event::Dispatcher& dispatcher, Runtime::Loader& runtime, @@ -155,6 +157,19 @@ void DetectorImpl::checkHostForUneject(HostPtr host, DetectorHostSinkImpl* sink, } } +bool DetectorImpl::enforceEjection(EjectionType type) { + switch (type) { + case EjectionType::Consecutive5xx: + return runtime_.snapshot().featureEnabled("outlier_detection.enforcing_consecutive_5xx", + config_.enforcingConsecutive5xx()); + case EjectionType::SuccessRate: + return runtime_.snapshot().featureEnabled("outlier_detection.enforcing_sr", + config_.enforcingSR()); + } + + NOT_IMPLEMENTED; +} + void DetectorImpl::ejectHost(HostPtr host, EjectionType type) { uint64_t max_ejection_percent = std::min( 100, runtime_.snapshot().getInteger("outlier_detection.max_ejection_percent", @@ -162,7 +177,7 @@ void DetectorImpl::ejectHost(HostPtr host, EjectionType type) { double ejected_percent = 100.0 * stats_.ejections_active_.value() / host_sinks_.size(); if (ejected_percent < max_ejection_percent) { stats_.ejections_total_.inc(); - if (runtime_.snapshot().featureEnabled("outlier_detection.enforcing", config_.enforcing())) { + if (enforceEjection(type)) { stats_.ejections_active_.inc(); host_sinks_[host]->eject(time_source_.currentSystemTime()); runCallbacks(host); diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 36c348fef048f..74a6e33bf0b01 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -140,18 +140,20 @@ class DetectorConfig { uint64_t baseEjectionTimeMs() { return base_ejection_time_ms_; } uint64_t consecutive5xx() { return consecutive_5xx_; } uint64_t maxEjectionPercent() { return max_ejection_percent_; } - uint64_t enforcing() { return enforcing_; } uint64_t significantHostThreshold() { return significant_host_threshold_; } uint64_t rqVolumeThreshold() { return rq_volume_threshold_; } + uint64_t enforcingConsecutive5xx() { return enforcing_consecutive_5xx_; } + uint64_t enforcingSR() { return enforcing_sr_; } private: const uint64_t interval_ms_; const uint64_t base_ejection_time_ms_; const uint64_t consecutive_5xx_; const uint64_t max_ejection_percent_; - const uint64_t enforcing_; const uint64_t significant_host_threshold_; const uint64_t rq_volume_threshold_; + const uint64_t enforcing_consecutive_5xx_; + const uint64_t enforcing_sr_; }; /** @@ -188,6 +190,7 @@ class DetectorImpl : public Detector, public std::enable_shared_from_this& sr_data); + bool enforceEjection(EjectionType type); DetectorConfig config_; Event::Dispatcher& dispatcher_; diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 8309f1668a300..35eb2624cb704 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -51,12 +51,16 @@ class CallbackChecker { class OutlierDetectorImplTest : public testing::Test { public: OutlierDetectorImplTest() { - ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing", 100)) + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_5xx", 100)) + .WillByDefault(Return(true)); + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_sr", 100)) .WillByDefault(Return(true)); } NiceMock cluster_; - NiceMock dispatcher_; + NiceMock dispatcher_; NiceMock runtime_; Event::MockTimer* interval_timer_ = new Event::MockTimer(&dispatcher_); CallbackChecker checker_; @@ -72,7 +76,8 @@ TEST_F(OutlierDetectorImplTest, DetectorStaticConfig) { "base_ejection_time_ms" : 10000, "consecutive_5xx" : 10, "max_ejection_percent" : 50, - "enforcing" : 10 + "enforcing_consecutive_5xx" : 10, + "enforcing_sr": 20 } )EOF"; @@ -84,7 +89,8 @@ TEST_F(OutlierDetectorImplTest, DetectorStaticConfig) { EXPECT_EQ(10000UL, detector->config().baseEjectionTimeMs()); EXPECT_EQ(10UL, detector->config().consecutive5xx()); EXPECT_EQ(50UL, detector->config().maxEjectionPercent()); - EXPECT_EQ(10UL, detector->config().enforcing()); + EXPECT_EQ(10UL, detector->config().enforcingConsecutive5xx()); + EXPECT_EQ(20UL, detector->config().enforcingSR()); } TEST_F(OutlierDetectorImplTest, DestroyWithActive) { @@ -302,7 +308,7 @@ TEST_F(OutlierDetectorImplTest, NotEnforcing) { cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing", 100)) + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_5xx", 100)) .WillByDefault(Return(false)); cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); EXPECT_FALSE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); From a4e588d10ed53ca6b2695b2479ed0c89a0b2d3c5 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Fri, 17 Mar 2017 16:53:34 -0700 Subject: [PATCH 11/29] tests --- .../common/upstream/outlier_detection_impl.cc | 7 +- .../common/upstream/outlier_detection_impl.h | 2 +- .../upstream/outlier_detection_impl_test.cc | 87 +++++++++++++++++-- 3 files changed, 85 insertions(+), 11 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index b97564ee67a71..08e2e7bf457d2 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -248,7 +248,6 @@ void DetectorImpl::onIntervalTimer() { std::unordered_map valid_sr_hosts; std::vector sr_data; double sr_sum = 0; - for (auto host : host_sinks_) { checkHostForUneject(host.first, host.second, now); @@ -260,7 +259,6 @@ void DetectorImpl::onIntervalTimer() { if (host_sinks_.size() >= runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", config_.significantHostThreshold())) { - Optional host_sr = host.second->srAccumulator().getSR(runtime_.snapshot().getInteger( "outlier_detection.rq_volume_threshold", config_.rqVolumeThreshold())); @@ -274,9 +272,10 @@ void DetectorImpl::onIntervalTimer() { if (valid_sr_hosts.size() >= runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", - config_.rqVolumeThreshold())) { + config_.significantHostThreshold())) { + double ejection_threshold = srEjectionThreshold(sr_sum, sr_data); for (auto host : valid_sr_hosts) { - if (host.second < srEjectionThreshold(sr_sum, sr_data)) { + if (host.second < ejection_threshold) { stats_.ejections_sr_.inc(); ejectHost(host.first, EjectionType::SuccessRate); } diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 74a6e33bf0b01..bd3510bb224bf 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -202,7 +202,7 @@ class DetectorImpl : public Detector, public std::enable_shared_from_this host_sinks_; EventLoggerPtr event_logger_; // Factor to multiply the stdev of a cluster's Success Rate for success rate outlier ejection. - static const uint64_t sr_stdev_factor_ = 2; + static constexpr double sr_stdev_factor_ = 1.9; }; class EventLoggerImpl : public EventLogger { diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 35eb2624cb704..15de47e8967f6 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -58,9 +58,7 @@ class OutlierDetectorImplTest : public testing::Test { } NiceMock cluster_; - NiceMock dispatcher_; + NiceMock dispatcher_; NiceMock runtime_; Event::MockTimer* interval_timer_ = new Event::MockTimer(&dispatcher_); CallbackChecker checker_; @@ -141,7 +139,7 @@ TEST_F(OutlierDetectorImplTest, DestroyHostInUse) { cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); } -TEST_F(OutlierDetectorImplTest, BasicFlow) { +TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { EXPECT_CALL(cluster_, addMemberUpdateCb(_)); cluster_.hosts_ = {HostPtr{new HostImpl( cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; @@ -217,6 +215,83 @@ TEST_F(OutlierDetectorImplTest, BasicFlow) { .value()); } +TEST_F(OutlierDetectorImplTest, BasicFlowSR) { + EXPECT_CALL(cluster_, addMemberUpdateCb(_)); + cluster_.hosts_ = {HostPtr{new HostImpl( + cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); + std::shared_ptr detector( + DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); + detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); + + for (int i = 81; i < 85; ++i) { + cluster_.hosts_.push_back(HostPtr{new HostImpl( + cluster_.info_, "", Network::Utility::resolveUrl(fmt::format("tcp://127.0.0.1:{}", i)), false, 1, "")}); + } + + cluster_.runCallbacks({cluster_.hosts_[1], cluster_.hosts_[2], cluster_.hosts_[3], cluster_.hosts_[4]}, {}); + + // Cause a consecutive SR error on one host. First have 4 of the hosts have perfect SR. + for (uint64_t i = 0; i < cluster_.hosts_.size() - 1; i++) { + for (int j = 0; j < 200; j++) { + cluster_.hosts_[i]->outlierDetector().putHttpResponseCode(200); + } + } + + for (int i = 0; i < 100; i++) { + cluster_.hosts_[4]->outlierDetector().putHttpResponseCode(200); + cluster_.hosts_[4]->outlierDetector().putHttpResponseCode(503); + } + + EXPECT_CALL(time_source_, currentSystemTime()) + .Times(2).WillRepeatedly(Return(SystemTime(std::chrono::milliseconds(10000)))); + EXPECT_CALL(checker_, check(cluster_.hosts_[4])); + EXPECT_CALL(*event_logger_, + logEject(std::static_pointer_cast(cluster_.hosts_[4]), + EjectionType::SuccessRate)); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); + interval_timer_->callback_(); + EXPECT_TRUE(cluster_.hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); + + // Interval that doesn't bring the host back in. + EXPECT_CALL(time_source_, currentSystemTime()) + .WillOnce(Return(SystemTime(std::chrono::milliseconds(19999)))); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); + interval_timer_->callback_(); + EXPECT_TRUE(cluster_.hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); + + // Interval that does bring the host back in. + EXPECT_CALL(time_source_, currentSystemTime()) + .WillOnce(Return(SystemTime(std::chrono::milliseconds(50001)))); + EXPECT_CALL(checker_, check(cluster_.hosts_[4])); + EXPECT_CALL(*event_logger_, + logUneject(std::static_pointer_cast(cluster_.hosts_[4]))); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); + interval_timer_->callback_(); + EXPECT_FALSE(cluster_.hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(0UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); + + // Give 4 hosts enough request volume but not to the 5th. Should not cause an ejection. + for (uint64_t i = 0; i < cluster_.hosts_.size() - 1; i++) { + for (int j = 0; j < 150; j++) { + cluster_.hosts_[i]->outlierDetector().putHttpResponseCode(200); + } + } + + for (int i = 0; i < 25; i++) { + cluster_.hosts_[4]->outlierDetector().putHttpResponseCode(200); + cluster_.hosts_[4]->outlierDetector().putHttpResponseCode(503); + } + + EXPECT_CALL(time_source_, currentSystemTime()) + .WillOnce(Return(SystemTime(std::chrono::milliseconds(60001)))); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); + interval_timer_->callback_(); + EXPECT_EQ(0UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); +} + TEST_F(OutlierDetectorImplTest, RemoveWhileEjected) { EXPECT_CALL(cluster_, addMemberUpdateCb(_)); cluster_.hosts_ = {HostPtr{new HostImpl( @@ -477,9 +552,9 @@ TEST(OutlierDetectionEventLoggerImplTest, All) { EXPECT_CALL(*file, write("{\"time\": \"1970-01-01T00:00:00.000Z\", \"secs_since_last_action\": " "\"30\", \"cluster\": " "\"fake_cluster\", \"upstream_url\": \"10.0.0.1:443\", \"action\": " - "\"eject\", \"type\": \"5xx\", \"num_ejections\": 0}\n")) + "\"eject\", \"type\": \"SR\", \"num_ejections\": 0}\n")) .WillOnce(SaveArg<0>(&log3)); - event_logger.logEject(host, EjectionType::Consecutive5xx); + event_logger.logEject(host, EjectionType::SuccessRate); Json::Factory::LoadFromString(log3); std::string log4; From 77b407b074418ff9dffdf6336e67c25c1ba6a7d3 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Fri, 17 Mar 2017 16:54:05 -0700 Subject: [PATCH 12/29] tests --- .../upstream/outlier_detection_impl_test.cc | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 15de47e8967f6..cbe2d1e52c6eb 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -218,18 +218,20 @@ TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { TEST_F(OutlierDetectorImplTest, BasicFlowSR) { EXPECT_CALL(cluster_, addMemberUpdateCb(_)); cluster_.hosts_ = {HostPtr{new HostImpl( - cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; + cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( - DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); + DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); for (int i = 81; i < 85; ++i) { cluster_.hosts_.push_back(HostPtr{new HostImpl( - cluster_.info_, "", Network::Utility::resolveUrl(fmt::format("tcp://127.0.0.1:{}", i)), false, 1, "")}); + cluster_.info_, "", Network::Utility::resolveUrl(fmt::format("tcp://127.0.0.1:{}", i)), + false, 1, "")}); } - cluster_.runCallbacks({cluster_.hosts_[1], cluster_.hosts_[2], cluster_.hosts_[3], cluster_.hosts_[4]}, {}); + cluster_.runCallbacks( + {cluster_.hosts_[1], cluster_.hosts_[2], cluster_.hosts_[3], cluster_.hosts_[4]}, {}); // Cause a consecutive SR error on one host. First have 4 of the hosts have perfect SR. for (uint64_t i = 0; i < cluster_.hosts_.size() - 1; i++) { @@ -244,7 +246,8 @@ TEST_F(OutlierDetectorImplTest, BasicFlowSR) { } EXPECT_CALL(time_source_, currentSystemTime()) - .Times(2).WillRepeatedly(Return(SystemTime(std::chrono::milliseconds(10000)))); + .Times(2) + .WillRepeatedly(Return(SystemTime(std::chrono::milliseconds(10000)))); EXPECT_CALL(checker_, check(cluster_.hosts_[4])); EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[4]), @@ -256,7 +259,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlowSR) { // Interval that doesn't bring the host back in. EXPECT_CALL(time_source_, currentSystemTime()) - .WillOnce(Return(SystemTime(std::chrono::milliseconds(19999)))); + .WillOnce(Return(SystemTime(std::chrono::milliseconds(19999)))); EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); interval_timer_->callback_(); EXPECT_TRUE(cluster_.hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); @@ -264,7 +267,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlowSR) { // Interval that does bring the host back in. EXPECT_CALL(time_source_, currentSystemTime()) - .WillOnce(Return(SystemTime(std::chrono::milliseconds(50001)))); + .WillOnce(Return(SystemTime(std::chrono::milliseconds(50001)))); EXPECT_CALL(checker_, check(cluster_.hosts_[4])); EXPECT_CALL(*event_logger_, logUneject(std::static_pointer_cast(cluster_.hosts_[4]))); @@ -286,7 +289,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlowSR) { } EXPECT_CALL(time_source_, currentSystemTime()) - .WillOnce(Return(SystemTime(std::chrono::milliseconds(60001)))); + .WillOnce(Return(SystemTime(std::chrono::milliseconds(60001)))); EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); interval_timer_->callback_(); EXPECT_EQ(0UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); From aaac3f32bf54d89144c252008fd0c7bc7a2ff484 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Tue, 21 Mar 2017 12:02:50 -0700 Subject: [PATCH 13/29] Update comments --- .../configuration/cluster_manager/cluster.rst | 7 ++-- .../cluster_manager/cluster_runtime.rst | 1 - docs/intro/arch_overview/outlier.rst | 2 +- .../common/upstream/outlier_detection_impl.cc | 14 ++++---- .../common/upstream/outlier_detection_impl.h | 35 +++++++++++++------ .../upstream/outlier_detection_impl_test.cc | 7 ++++ 6 files changed, 43 insertions(+), 23 deletions(-) diff --git a/docs/configuration/cluster_manager/cluster.rst b/docs/configuration/cluster_manager/cluster.rst index c5b54243304d8..308f9b9c65908 100644 --- a/docs/configuration/cluster_manager/cluster.rst +++ b/docs/configuration/cluster_manager/cluster.rst @@ -191,9 +191,10 @@ outlier_detection .. _config_cluster_manager_cluster_outlier_detection_rq_volume_threshold: rq_volume_threshold: - The number of total requests that must be collected in one interval to include this host in success rate - based outlier detection. If the volume is lower than this setting, outlier detection via success rate - statistics is not performed for that host. Defaults to 100. + The minimum number of total requests that must be collected in one interval + (as defined by :ref:`interval_ms ` above) + to include this host in success rate based outlier detection. If the volume is lower than this setting, + outlier detection via success rate statistics is not performed for that host. Defaults to 100. Each of the above configuration values can be overridden via :ref:`runtime values `. diff --git a/docs/configuration/cluster_manager/cluster_runtime.rst b/docs/configuration/cluster_manager/cluster_runtime.rst index 5f64c7986221a..ae526eec0a200 100644 --- a/docs/configuration/cluster_manager/cluster_runtime.rst +++ b/docs/configuration/cluster_manager/cluster_runtime.rst @@ -72,7 +72,6 @@ outlier_detection.rq_volume_threshold ` setting in outlier detection - Core ---- diff --git a/docs/intro/arch_overview/outlier.rst b/docs/intro/arch_overview/outlier.rst index d2dcb6d731b6a..1e8e65f0614d2 100644 --- a/docs/intro/arch_overview/outlier.rst +++ b/docs/intro/arch_overview/outlier.rst @@ -56,7 +56,7 @@ interval ejects hosts based on statistical outlier detection. Success Rate outli calculated for a host if its request volume over the aggregation interval is less than the :ref:`outlier_detection.rq_volume_threshold` value. Moreover, detection will not be performed for a cluster if the number of hosts -with enough request volume in an interval is less than the +with the minimum required request volume in an interval is less than the :ref:`outlier_detection.significant_host_threshold` value. diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 08e2e7bf457d2..3879d2de9640a 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -167,7 +167,7 @@ bool DetectorImpl::enforceEjection(EjectionType type) { config_.enforcingSR()); } - NOT_IMPLEMENTED; + NOT_REACHED; } void DetectorImpl::ejectHost(HostPtr host, EjectionType type) { @@ -232,7 +232,7 @@ void DetectorImpl::onConsecutive5xxWorker(HostPtr host) { ejectHost(host, EjectionType::Consecutive5xx); } -double DetectorImpl::srEjectionThreshold(double sr_sum, std::vector& sr_data) { +double Utility::srEjectionThreshold(double sr_sum, std::vector& sr_data) { double mean = sr_sum / sr_data.size(); double stdev = 0; std::for_each(sr_data.begin(), sr_data.end(), @@ -240,7 +240,7 @@ double DetectorImpl::srEjectionThreshold(double sr_sum, std::vector& sr_ stdev /= sr_data.size(); stdev = std::sqrt(stdev); - return mean - (sr_stdev_factor_ * stdev); + return mean - (SR_STDEV_FACTOR * stdev); } void DetectorImpl::onIntervalTimer() { @@ -251,8 +251,8 @@ void DetectorImpl::onIntervalTimer() { for (auto host : host_sinks_) { checkHostForUneject(host.first, host.second, now); - // Success Rate Outlier Detection - // First swap out the current bucket been written to, to keep data valid + // Success Rate Outlier Detection. + // First swap out the current bucket been written to, to keep data valid. host.second->updateCurrentSRBucket(); // If there are not enough hosts to begin with, don't do the work. @@ -273,7 +273,7 @@ void DetectorImpl::onIntervalTimer() { if (valid_sr_hosts.size() >= runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", config_.significantHostThreshold())) { - double ejection_threshold = srEjectionThreshold(sr_sum, sr_data); + double ejection_threshold = Utility::srEjectionThreshold(sr_sum, sr_data); for (auto host : valid_sr_hosts) { if (host.second < ejection_threshold) { stats_.ejections_sr_.inc(); @@ -340,7 +340,7 @@ std::string EventLoggerImpl::typeToString(EjectionType type) { return "SR"; } - NOT_IMPLEMENTED; + NOT_REACHED; } int EventLoggerImpl::secsSinceLastAction(const Optional& lastActionTime, diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index bd3510bb224bf..745d0a7c7b137 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -44,10 +44,8 @@ struct SRAccumulatorBucket { /** * The S(uccess)R(ate)AccumulatorImpl uses the SRAccumulatorBucket to get per host Success Rate - * stats. - * This implementation has a fixed window size of time, and thus only needs a bucket to write to, - * and - * a bucket to accumulate/run stats over. + * stats. This implementation has a fixed window size of time, and thus only needs a + * bucket to write to, and a bucket to accumulate/run stats over. */ class SRAccumulatorImpl { public: @@ -61,11 +59,9 @@ class SRAccumulatorImpl { * window of time could be dynamically adjusted. In the current implementation it is a fixed time * window. * @param rq_volume_threshold, the threshold of requests an accumulator has to have in order to be - * able to return - * a significant SR value. + * able to return a significant SR value. * @return a valid Optional with the success rate. If there were not enough requests, an - * invalid Optional - * is returned. + * invalid Optional is returned. */ Optional getSR(uint64_t rq_volume_threshold); @@ -189,7 +185,6 @@ class DetectorImpl : public Detector, public std::enable_shared_from_this& sr_data); bool enforceEjection(EjectionType type); DetectorConfig config_; @@ -201,8 +196,6 @@ class DetectorImpl : public Detector, public std::enable_shared_from_this callbacks_; std::unordered_map host_sinks_; EventLoggerPtr event_logger_; - // Factor to multiply the stdev of a cluster's Success Rate for success rate outlier ejection. - static constexpr double sr_stdev_factor_ = 1.9; }; class EventLoggerImpl : public EventLogger { @@ -223,5 +216,25 @@ class EventLoggerImpl : public EventLogger { SystemTimeSource& time_source_; }; +/** + * Utilities for Outlier Detection + */ +class Utility { +public: + /** + * This function returns the Success Rate trheshold for Success Rate outlier detection. If a + * host's + * Success Rate is under this threshold the host is an outlier. + * @param sr_sum is the sum of the data in the sr_data vector. + * @param sr_data is the vector containing the individual success rate data points. + * @return the Success Rate threshold. + */ + static double srEjectionThreshold(double sr_sum, std::vector& sr_data); + +private: + // Factor to multiply the stdev of a cluster's Success Rate for success rate outlier ejection. + static constexpr double SR_STDEV_FACTOR = 1.9; +}; + } // Outlier } // Upstream diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 58a448af6b6fb..0e5c5f397784d 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -571,5 +571,12 @@ TEST(OutlierDetectionEventLoggerImplTest, All) { Json::Factory::LoadFromString(log4); } +TEST(OutlierUtility, SRThreshold) { + std::vector data = {50, 100, 100, 100, 100}; + double sum = std::accumulate(data.begin(), data.end(), 0.0); + + EXPECT_EQ(Utility::srEjectionThreshold(sum, data), 52); +} + } // Outlier } // Upstream From 487256a0a361086dd2b597765f0a5ab76936d14c Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Tue, 21 Mar 2017 12:04:05 -0700 Subject: [PATCH 14/29] space --- source/common/upstream/outlier_detection_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 745d0a7c7b137..35c03485a1b4c 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -51,7 +51,7 @@ class SRAccumulatorImpl { public: SRAccumulatorImpl() : current_sr_bucket_(new SRAccumulatorBucket()), - backup_sr_bucket_(new SRAccumulatorBucket()){}; + backup_sr_bucket_(new SRAccumulatorBucket()) {}; SRAccumulatorBucket* getCurrentWriter(); /** * This function returns the SR of a host over a window of time if the request volume is high From 791035538557ea7ba015eb56c7ba54e67f579af2 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Tue, 21 Mar 2017 12:18:58 -0700 Subject: [PATCH 15/29] helper method --- .../common/upstream/outlier_detection_impl.h | 2 +- .../upstream/outlier_detection_impl_test.cc | 41 ++++++++++--------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 35c03485a1b4c..745d0a7c7b137 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -51,7 +51,7 @@ class SRAccumulatorImpl { public: SRAccumulatorImpl() : current_sr_bucket_(new SRAccumulatorBucket()), - backup_sr_bucket_(new SRAccumulatorBucket()) {}; + backup_sr_bucket_(new SRAccumulatorBucket()){}; SRAccumulatorBucket* getCurrentWriter(); /** * This function returns the SR of a host over a window of time if the request volume is high diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 0e5c5f397784d..ce1d31c6437d6 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -57,6 +57,23 @@ class OutlierDetectorImplTest : public testing::Test { .WillByDefault(Return(true)); } + void loadRq(std::vector& hosts, int num_rq) { + for (uint64_t i = 0; i < hosts.size() - 1; i++) { + for (int j = 0; j < num_rq; j++) { + hosts[i]->outlierDetector().putHttpResponseCode(200); + } + } + } + + void loadRq(HostPtr host, int num_rq, int failure_mod) { + for (int i = 0; i < num_rq; i++) { + host->outlierDetector().putHttpResponseCode(200); + if (i % failure_mod == 0) { + host->outlierDetector().putHttpResponseCode(503); + } + } + } + NiceMock cluster_; NiceMock dispatcher_; NiceMock runtime_; @@ -235,16 +252,8 @@ TEST_F(OutlierDetectorImplTest, BasicFlowSR) { {cluster_.hosts_[1], cluster_.hosts_[2], cluster_.hosts_[3], cluster_.hosts_[4]}, {}); // Cause a consecutive SR error on one host. First have 4 of the hosts have perfect SR. - for (uint64_t i = 0; i < cluster_.hosts_.size() - 1; i++) { - for (int j = 0; j < 200; j++) { - cluster_.hosts_[i]->outlierDetector().putHttpResponseCode(200); - } - } - - for (int i = 0; i < 100; i++) { - cluster_.hosts_[4]->outlierDetector().putHttpResponseCode(200); - cluster_.hosts_[4]->outlierDetector().putHttpResponseCode(503); - } + loadRq(cluster_.hosts_, 200); + loadRq(cluster_.hosts_[4], 100, 1); EXPECT_CALL(time_source_, currentSystemTime()) .Times(2) @@ -278,16 +287,8 @@ TEST_F(OutlierDetectorImplTest, BasicFlowSR) { EXPECT_EQ(0UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); // Give 4 hosts enough request volume but not to the 5th. Should not cause an ejection. - for (uint64_t i = 0; i < cluster_.hosts_.size() - 1; i++) { - for (int j = 0; j < 150; j++) { - cluster_.hosts_[i]->outlierDetector().putHttpResponseCode(200); - } - } - - for (int i = 0; i < 25; i++) { - cluster_.hosts_[4]->outlierDetector().putHttpResponseCode(200); - cluster_.hosts_[4]->outlierDetector().putHttpResponseCode(503); - } + loadRq(cluster_.hosts_, 150); + loadRq(cluster_.hosts_[4], 25, 1); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(60001)))); From 50d68e4024f7d3960da9a82b60b39b7e6dfefccf Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Tue, 21 Mar 2017 12:55:57 -0700 Subject: [PATCH 16/29] fix comment --- source/common/upstream/outlier_detection_impl.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 745d0a7c7b137..61961645052ba 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -55,9 +55,8 @@ class SRAccumulatorImpl { SRAccumulatorBucket* getCurrentWriter(); /** * This function returns the SR of a host over a window of time if the request volume is high - * enough. The underlying - * window of time could be dynamically adjusted. In the current implementation it is a fixed time - * window. + * enough. The underlying window of time could be dynamically adjusted. In the current + * implementation it is a fixed time window. * @param rq_volume_threshold, the threshold of requests an accumulator has to have in order to be * able to return a significant SR value. * @return a valid Optional with the success rate. If there were not enough requests, an From 39e75a1281f167c40453ea51151b09052a4b8a7d Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Tue, 21 Mar 2017 16:03:36 -0700 Subject: [PATCH 17/29] update comments --- .../common/upstream/outlier_detection_impl.cc | 21 ++- .../common/upstream/outlier_detection_impl.h | 14 +- .../upstream/outlier_detection_impl_test.cc | 145 +++++++----------- 3 files changed, 76 insertions(+), 104 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 3879d2de9640a..813be0c78b846 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -1,3 +1,4 @@ +#include #include "outlier_detection_impl.h" #include "envoy/event/dispatcher.h" @@ -232,6 +233,8 @@ void DetectorImpl::onConsecutive5xxWorker(HostPtr host) { ejectHost(host, EjectionType::Consecutive5xx); } +const double Utility::SR_STDEV_FACTOR = 1.9; + double Utility::srEjectionThreshold(double sr_sum, std::vector& sr_data) { double mean = sr_sum / sr_data.size(); double stdev = 0; @@ -246,8 +249,15 @@ double Utility::srEjectionThreshold(double sr_sum, std::vector& sr_data) void DetectorImpl::onIntervalTimer() { SystemTime now = time_source_.currentSystemTime(); std::unordered_map valid_sr_hosts; + uint64_t significant_host_threshold = runtime_.snapshot().getInteger( + "outlier_detection.significant_host_threshold", config_.significantHostThreshold()); + uint64_t rq_volume_threshold = runtime_.snapshot().getInteger( + "outlier_detection.rq_volume_threshold", config_.rqVolumeThreshold()); std::vector sr_data; double sr_sum = 0; + // reserve upper bound of vector size to avoid reallocation. + sr_data.reserve(host_sinks_.size()); + for (auto host : host_sinks_) { checkHostForUneject(host.first, host.second, now); @@ -256,11 +266,8 @@ void DetectorImpl::onIntervalTimer() { host.second->updateCurrentSRBucket(); // If there are not enough hosts to begin with, don't do the work. - if (host_sinks_.size() >= - runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", - config_.significantHostThreshold())) { - Optional host_sr = host.second->srAccumulator().getSR(runtime_.snapshot().getInteger( - "outlier_detection.rq_volume_threshold", config_.rqVolumeThreshold())); + if (host_sinks_.size() >= significant_host_threshold) { + Optional host_sr = host.second->srAccumulator().getSR(rq_volume_threshold); if (host_sr.valid()) { valid_sr_hosts[host.first] = host_sr.value(); @@ -270,9 +277,7 @@ void DetectorImpl::onIntervalTimer() { } } - if (valid_sr_hosts.size() >= - runtime_.snapshot().getInteger("outlier_detection.significant_host_threshold", - config_.significantHostThreshold())) { + if (valid_sr_hosts.size() >= significant_host_threshold) { double ejection_threshold = Utility::srEjectionThreshold(sr_sum, sr_data); for (auto host : valid_sr_hosts) { if (host.second < ejection_threshold) { diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 61961645052ba..4ce2590076b8f 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -51,16 +51,17 @@ class SRAccumulatorImpl { public: SRAccumulatorImpl() : current_sr_bucket_(new SRAccumulatorBucket()), - backup_sr_bucket_(new SRAccumulatorBucket()){}; + backup_sr_bucket_(new SRAccumulatorBucket()) {} + SRAccumulatorBucket* getCurrentWriter(); /** * This function returns the SR of a host over a window of time if the request volume is high * enough. The underlying window of time could be dynamically adjusted. In the current * implementation it is a fixed time window. - * @param rq_volume_threshold, the threshold of requests an accumulator has to have in order to be - * able to return a significant SR value. + * @param rq_volume_threshold the threshold of requests an accumulator has to have in order to be + * able to return a significant SR value. * @return a valid Optional with the success rate. If there were not enough requests, an - * invalid Optional is returned. + * invalid Optional is returned. */ Optional getSR(uint64_t rq_volume_threshold); @@ -222,8 +223,7 @@ class Utility { public: /** * This function returns the Success Rate trheshold for Success Rate outlier detection. If a - * host's - * Success Rate is under this threshold the host is an outlier. + * host's Success Rate is under this threshold the host is an outlier. * @param sr_sum is the sum of the data in the sr_data vector. * @param sr_data is the vector containing the individual success rate data points. * @return the Success Rate threshold. @@ -232,7 +232,7 @@ class Utility { private: // Factor to multiply the stdev of a cluster's Success Rate for success rate outlier ejection. - static constexpr double SR_STDEV_FACTOR = 1.9; + static const double SR_STDEV_FACTOR; }; } // Outlier diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index ce1d31c6437d6..31b7a19b276e4 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -57,18 +57,23 @@ class OutlierDetectorImplTest : public testing::Test { .WillByDefault(Return(true)); } - void loadRq(std::vector& hosts, int num_rq) { - for (uint64_t i = 0; i < hosts.size() - 1; i++) { + void loadRq(std::vector& hosts, int num_rq, bool success) { + for (uint64_t i = 0; i < hosts.size(); i++) { for (int j = 0; j < num_rq; j++) { - hosts[i]->outlierDetector().putHttpResponseCode(200); + if (success) { + hosts[i]->outlierDetector().putHttpResponseCode(200); + } else { + hosts[i]->outlierDetector().putHttpResponseCode(503); + } } } } - void loadRq(HostPtr host, int num_rq, int failure_mod) { + void loadRq(HostPtr host, int num_rq, bool success) { for (int i = 0; i < num_rq; i++) { - host->outlierDetector().putHttpResponseCode(200); - if (i % failure_mod == 0) { + if (success) { + host->outlierDetector().putHttpResponseCode(200); + } else { host->outlierDetector().putHttpResponseCode(503); } } @@ -118,10 +123,7 @@ TEST_F(OutlierDetectorImplTest, DestroyWithActive) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, false); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -129,7 +131,7 @@ TEST_F(OutlierDetectorImplTest, DestroyWithActive) { EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[0]), EjectionType::Consecutive5xx)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, false); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -150,11 +152,7 @@ TEST_F(OutlierDetectorImplTest, DestroyHostInUse) { detector.reset(); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 5, false); } TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { @@ -171,13 +169,10 @@ TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { cluster_.runCallbacks({cluster_.hosts_[1]}, {}); // Cause a consecutive 5xx error. - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(200); + loadRq(cluster_.hosts_[0], 1, false); + loadRq(cluster_.hosts_[0], 1, true); cluster_.hosts_[0]->outlierDetector().putResponseTime(std::chrono::milliseconds(5)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, false); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -185,7 +180,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[0]), EjectionType::Consecutive5xx)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, false); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -207,13 +202,10 @@ TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { EXPECT_FALSE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); // Eject host again to cause an ejection after an unejection has taken place - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(200); + loadRq(cluster_.hosts_[0], 1, false); + loadRq(cluster_.hosts_[0], 1, true); cluster_.hosts_[0]->outlierDetector().putResponseTime(std::chrono::milliseconds(5)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, false); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(40000)))); @@ -221,7 +213,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[0]), EjectionType::Consecutive5xx)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, false); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -235,25 +227,29 @@ TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { TEST_F(OutlierDetectorImplTest, BasicFlowSR) { EXPECT_CALL(cluster_, addMemberUpdateCb(_)); - cluster_.hosts_ = {HostPtr{new HostImpl( - cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; + cluster_.hosts_ = { + HostPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), + false, 1, "")}, + HostPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:81"), + false, 1, "")}, + HostPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:82"), + false, 1, "")}, + HostPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:83"), + false, 1, "")}, + HostPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:84"), + false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); - for (int i = 81; i < 85; ++i) { - cluster_.hosts_.push_back(HostPtr{new HostImpl( - cluster_.info_, "", Network::Utility::resolveUrl(fmt::format("tcp://127.0.0.1:{}", i)), - false, 1, "")}); - } - - cluster_.runCallbacks( - {cluster_.hosts_[1], cluster_.hosts_[2], cluster_.hosts_[3], cluster_.hosts_[4]}, {}); + // Turn off 5xx detection to test SR detection in isolation. + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_5xx", 100)) + .WillByDefault(Return(false)); // Cause a consecutive SR error on one host. First have 4 of the hosts have perfect SR. - loadRq(cluster_.hosts_, 200); - loadRq(cluster_.hosts_[4], 100, 1); + loadRq(cluster_.hosts_, 200, true); + loadRq(cluster_.hosts_[4], 200, false); EXPECT_CALL(time_source_, currentSystemTime()) .Times(2) @@ -287,8 +283,8 @@ TEST_F(OutlierDetectorImplTest, BasicFlowSR) { EXPECT_EQ(0UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); // Give 4 hosts enough request volume but not to the 5th. Should not cause an ejection. - loadRq(cluster_.hosts_, 150); - loadRq(cluster_.hosts_[4], 25, 1); + loadRq(cluster_.hosts_, 25, true); + loadRq(cluster_.hosts_[4], 25, false); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(60001)))); @@ -306,10 +302,7 @@ TEST_F(OutlierDetectorImplTest, RemoveWhileEjected) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, false); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -317,7 +310,7 @@ TEST_F(OutlierDetectorImplTest, RemoveWhileEjected) { EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[0]), EjectionType::Consecutive5xx)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, false); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -348,10 +341,7 @@ TEST_F(OutlierDetectorImplTest, Overflow) { ON_CALL(runtime_.snapshot_, getInteger("outlier_detection.max_ejection_percent", _)) .WillByDefault(Return(1)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, false); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -362,11 +352,7 @@ TEST_F(OutlierDetectorImplTest, Overflow) { cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); - cluster_.hosts_[1]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[1]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[1]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[1]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[1]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[1], 5, false); EXPECT_FALSE(cluster_.hosts_[1]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -383,14 +369,11 @@ TEST_F(OutlierDetectorImplTest, NotEnforcing) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, false); ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_5xx", 100)) .WillByDefault(Return(false)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, false); EXPECT_FALSE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(0UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -408,14 +391,11 @@ TEST_F(OutlierDetectorImplTest, CrossThreadRemoveRace) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, false); Event::PostCb post_cb; EXPECT_CALL(dispatcher_, post(_)).WillOnce(SaveArg<0>(&post_cb)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, false); // Remove before the cross thread event comes in. std::vector old_hosts = std::move(cluster_.hosts_); @@ -434,14 +414,11 @@ TEST_F(OutlierDetectorImplTest, CrossThreadDestroyRace) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, false); Event::PostCb post_cb; EXPECT_CALL(dispatcher_, post(_)).WillOnce(SaveArg<0>(&post_cb)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, false); // Destroy before the cross thread event comes in. std::weak_ptr weak_detector = detector; @@ -461,14 +438,11 @@ TEST_F(OutlierDetectorImplTest, CrossThreadFailRace) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, false); Event::PostCb post_cb; EXPECT_CALL(dispatcher_, post(_)).WillOnce(SaveArg<0>(&post_cb)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, false); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -495,10 +469,7 @@ TEST_F(OutlierDetectorImplTest, Consecutive5xxAlreadyEjected) { detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); // Cause a consecutive 5xx error. - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, false); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -506,16 +477,12 @@ TEST_F(OutlierDetectorImplTest, Consecutive5xxAlreadyEjected) { EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[0]), EjectionType::Consecutive5xx)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, false); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); // Cause another consecutive 5xx error. - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(200); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, true); + loadRq(cluster_.hosts_[0], 5, false); } TEST(OutlierDetectionEventLoggerImplTest, All) { From c2b00d058279c1e39107e96d758af3a0f0133e9a Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Tue, 21 Mar 2017 17:24:27 -0700 Subject: [PATCH 18/29] update --- .../common/upstream/outlier_detection_impl.cc | 7 +- .../upstream/outlier_detection_impl_test.cc | 88 +++++++++---------- 2 files changed, 48 insertions(+), 47 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 813be0c78b846..f963674a432cb 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -255,8 +255,11 @@ void DetectorImpl::onIntervalTimer() { "outlier_detection.rq_volume_threshold", config_.rqVolumeThreshold()); std::vector sr_data; double sr_sum = 0; - // reserve upper bound of vector size to avoid reallocation. - sr_data.reserve(host_sinks_.size()); + + if (host_sinks_.size() >= significant_host_threshold) { + // reserve upper bound of vector size to avoid reallocation. + sr_data.reserve(host_sinks_.size()); + } for (auto host : host_sinks_) { checkHostForUneject(host.first, host.second, now); diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 31b7a19b276e4..b4cdf57065b8a 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -57,25 +57,15 @@ class OutlierDetectorImplTest : public testing::Test { .WillByDefault(Return(true)); } - void loadRq(std::vector& hosts, int num_rq, bool success) { + void loadRq(std::vector& hosts, int num_rq, int http_code) { for (uint64_t i = 0; i < hosts.size(); i++) { - for (int j = 0; j < num_rq; j++) { - if (success) { - hosts[i]->outlierDetector().putHttpResponseCode(200); - } else { - hosts[i]->outlierDetector().putHttpResponseCode(503); - } - } + loadRq(hosts[i], num_rq, http_code); } } - void loadRq(HostPtr host, int num_rq, bool success) { + void loadRq(HostPtr host, int num_rq, int http_code) { for (int i = 0; i < num_rq; i++) { - if (success) { - host->outlierDetector().putHttpResponseCode(200); - } else { - host->outlierDetector().putHttpResponseCode(503); - } + host->outlierDetector().putHttpResponseCode(http_code); } } @@ -123,7 +113,7 @@ TEST_F(OutlierDetectorImplTest, DestroyWithActive) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); - loadRq(cluster_.hosts_[0], 4, false); + loadRq(cluster_.hosts_[0], 4, 503); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -131,7 +121,7 @@ TEST_F(OutlierDetectorImplTest, DestroyWithActive) { EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[0]), EjectionType::Consecutive5xx)); - loadRq(cluster_.hosts_[0], 1, false); + loadRq(cluster_.hosts_[0], 1, 503); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -152,7 +142,7 @@ TEST_F(OutlierDetectorImplTest, DestroyHostInUse) { detector.reset(); - loadRq(cluster_.hosts_[0], 5, false); + loadRq(cluster_.hosts_[0], 5, 503); } TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { @@ -169,10 +159,10 @@ TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { cluster_.runCallbacks({cluster_.hosts_[1]}, {}); // Cause a consecutive 5xx error. - loadRq(cluster_.hosts_[0], 1, false); - loadRq(cluster_.hosts_[0], 1, true); + loadRq(cluster_.hosts_[0], 1, 503); + loadRq(cluster_.hosts_[0], 1, 200); cluster_.hosts_[0]->outlierDetector().putResponseTime(std::chrono::milliseconds(5)); - loadRq(cluster_.hosts_[0], 4, false); + loadRq(cluster_.hosts_[0], 4, 503); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -180,7 +170,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[0]), EjectionType::Consecutive5xx)); - loadRq(cluster_.hosts_[0], 1, false); + loadRq(cluster_.hosts_[0], 1, 503); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -202,10 +192,10 @@ TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { EXPECT_FALSE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); // Eject host again to cause an ejection after an unejection has taken place - loadRq(cluster_.hosts_[0], 1, false); - loadRq(cluster_.hosts_[0], 1, true); + loadRq(cluster_.hosts_[0], 1, 503); + loadRq(cluster_.hosts_[0], 1, 200); cluster_.hosts_[0]->outlierDetector().putResponseTime(std::chrono::milliseconds(5)); - loadRq(cluster_.hosts_[0], 4, false); + loadRq(cluster_.hosts_[0], 4, 503); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(40000)))); @@ -213,7 +203,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[0]), EjectionType::Consecutive5xx)); - loadRq(cluster_.hosts_[0], 1, false); + loadRq(cluster_.hosts_[0], 1, 503); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -248,8 +238,8 @@ TEST_F(OutlierDetectorImplTest, BasicFlowSR) { .WillByDefault(Return(false)); // Cause a consecutive SR error on one host. First have 4 of the hosts have perfect SR. - loadRq(cluster_.hosts_, 200, true); - loadRq(cluster_.hosts_[4], 200, false); + loadRq(cluster_.hosts_, 200, 200); + loadRq(cluster_.hosts_[4], 200, 503); EXPECT_CALL(time_source_, currentSystemTime()) .Times(2) @@ -283,8 +273,8 @@ TEST_F(OutlierDetectorImplTest, BasicFlowSR) { EXPECT_EQ(0UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); // Give 4 hosts enough request volume but not to the 5th. Should not cause an ejection. - loadRq(cluster_.hosts_, 25, true); - loadRq(cluster_.hosts_[4], 25, false); + loadRq(cluster_.hosts_, 25, 200); + loadRq(cluster_.hosts_[4], 25, 503); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(60001)))); @@ -302,7 +292,7 @@ TEST_F(OutlierDetectorImplTest, RemoveWhileEjected) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); - loadRq(cluster_.hosts_[0], 4, false); + loadRq(cluster_.hosts_[0], 4, 503); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -310,7 +300,7 @@ TEST_F(OutlierDetectorImplTest, RemoveWhileEjected) { EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[0]), EjectionType::Consecutive5xx)); - loadRq(cluster_.hosts_[0], 1, false); + loadRq(cluster_.hosts_[0], 1, 503); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -341,7 +331,7 @@ TEST_F(OutlierDetectorImplTest, Overflow) { ON_CALL(runtime_.snapshot_, getInteger("outlier_detection.max_ejection_percent", _)) .WillByDefault(Return(1)); - loadRq(cluster_.hosts_[0], 4, false); + loadRq(cluster_.hosts_[0], 4, 503); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -352,7 +342,7 @@ TEST_F(OutlierDetectorImplTest, Overflow) { cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); - loadRq(cluster_.hosts_[1], 5, false); + loadRq(cluster_.hosts_[1], 5, 503); EXPECT_FALSE(cluster_.hosts_[1]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -369,11 +359,11 @@ TEST_F(OutlierDetectorImplTest, NotEnforcing) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); - loadRq(cluster_.hosts_[0], 4, false); + loadRq(cluster_.hosts_[0], 4, 503); ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_5xx", 100)) .WillByDefault(Return(false)); - loadRq(cluster_.hosts_[0], 1, false); + loadRq(cluster_.hosts_[0], 1, 503); EXPECT_FALSE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(0UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -391,11 +381,11 @@ TEST_F(OutlierDetectorImplTest, CrossThreadRemoveRace) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); - loadRq(cluster_.hosts_[0], 4, false); + loadRq(cluster_.hosts_[0], 4, 503); Event::PostCb post_cb; EXPECT_CALL(dispatcher_, post(_)).WillOnce(SaveArg<0>(&post_cb)); - loadRq(cluster_.hosts_[0], 1, false); + loadRq(cluster_.hosts_[0], 1, 503); // Remove before the cross thread event comes in. std::vector old_hosts = std::move(cluster_.hosts_); @@ -414,11 +404,11 @@ TEST_F(OutlierDetectorImplTest, CrossThreadDestroyRace) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); - loadRq(cluster_.hosts_[0], 4, false); + loadRq(cluster_.hosts_[0], 4, 503); Event::PostCb post_cb; EXPECT_CALL(dispatcher_, post(_)).WillOnce(SaveArg<0>(&post_cb)); - loadRq(cluster_.hosts_[0], 1, false); + loadRq(cluster_.hosts_[0], 1, 503); // Destroy before the cross thread event comes in. std::weak_ptr weak_detector = detector; @@ -438,11 +428,11 @@ TEST_F(OutlierDetectorImplTest, CrossThreadFailRace) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); - loadRq(cluster_.hosts_[0], 4, false); + loadRq(cluster_.hosts_[0], 4, 503); Event::PostCb post_cb; EXPECT_CALL(dispatcher_, post(_)).WillOnce(SaveArg<0>(&post_cb)); - loadRq(cluster_.hosts_[0], 1, false); + loadRq(cluster_.hosts_[0], 1, 503); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -469,7 +459,7 @@ TEST_F(OutlierDetectorImplTest, Consecutive5xxAlreadyEjected) { detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); // Cause a consecutive 5xx error. - loadRq(cluster_.hosts_[0], 4, false); + loadRq(cluster_.hosts_[0], 4, 503); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -477,12 +467,20 @@ TEST_F(OutlierDetectorImplTest, Consecutive5xxAlreadyEjected) { EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[0]), EjectionType::Consecutive5xx)); - loadRq(cluster_.hosts_[0], 1, false); + loadRq(cluster_.hosts_[0], 1, 503); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); // Cause another consecutive 5xx error. - loadRq(cluster_.hosts_[0], 1, true); - loadRq(cluster_.hosts_[0], 5, false); + loadRq(cluster_.hosts_[0], 1, 200); + loadRq(cluster_.hosts_[0], 5, 503); +} + +TEST(DetectorHostSinkNullImplTest, All) { + DetectorHostSinkNullImpl null_sink; + + EXPECT_EQ(0UL, null_sink.numEjections()); + EXPECT_FALSE(null_sink.lastEjectionTime().valid()); + EXPECT_FALSE(null_sink.lastUnejectionTime().valid()); } TEST(OutlierDetectionEventLoggerImplTest, All) { From 19aa4c3d54bb2ef0e8b8dc2c1f99a36c833697b3 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Wed, 22 Mar 2017 14:38:52 -0700 Subject: [PATCH 19/29] address most comments --- .../configuration/cluster_manager/cluster.rst | 12 +- .../cluster_manager/cluster_runtime.rst | 18 +-- docs/intro/arch_overview/outlier.rst | 8 +- .../common/upstream/outlier_detection_impl.cc | 138 ++++++++++-------- .../common/upstream/outlier_detection_impl.h | 77 +++++----- .../upstream/outlier_detection_impl_test.cc | 12 +- 6 files changed, 146 insertions(+), 119 deletions(-) diff --git a/docs/configuration/cluster_manager/cluster.rst b/docs/configuration/cluster_manager/cluster.rst index 308f9b9c65908..cb78a199b917a 100644 --- a/docs/configuration/cluster_manager/cluster.rst +++ b/docs/configuration/cluster_manager/cluster.rst @@ -174,23 +174,23 @@ outlier_detection The % chance that a host will be actually ejected when an outlier status is detected through consecutive 5xx. This setting can be used to disable ejection or to ramp it up slowly. Defaults to 100. - .. _config_cluster_manager_cluster_outlier_detection_enforcing_sr: + .. _config_cluster_manager_cluster_outlier_detection_enforcing_success_rate: - enforcing_sr + enforcing_success_rate The % chance that a host will be actually ejected when an outlier status is detected through success rate statistics. This setting can be used to disable ejection or to ramp it up slowly. Defaults to 100. - .. _config_cluster_manager_cluster_outlier_detection_significant_host_threshold: + .. _config_cluster_manager_cluster_outlier_detection_success_rate_minimum_hosts: - significant_host_threshold + success_rate_minimum_hosts The number of hosts in a cluster that must have enough request volume to detect success rate outliers. If the number of hosts is less than this setting, outlier detection via success rate statistics is not performed for any host in the cluster. Defaults to 5. - .. _config_cluster_manager_cluster_outlier_detection_rq_volume_threshold: + .. _config_cluster_manager_cluster_outlier_detection_success_rate_request_volume: - rq_volume_threshold: + success_rate_request_volume The minimum number of total requests that must be collected in one interval (as defined by :ref:`interval_ms ` above) to include this host in success rate based outlier detection. If the volume is lower than this setting, diff --git a/docs/configuration/cluster_manager/cluster_runtime.rst b/docs/configuration/cluster_manager/cluster_runtime.rst index ae526eec0a200..d284d3b935d17 100644 --- a/docs/configuration/cluster_manager/cluster_runtime.rst +++ b/docs/configuration/cluster_manager/cluster_runtime.rst @@ -57,19 +57,19 @@ outlier_detection.enforcing_consecutive_5xx ` setting in outlier detection -outlier_detection.enforcing_sr - :ref:`enforcing_sr - ` +outlier_detection.enforcing_success_rate + :ref:`enforcing_success_rate + ` setting in outlier detection -outlier_detection.significant_host_threshold - :ref:`significant_host_threshold - ` +outlier_detection.success_rate_minimum_hosts + :ref:`success_rate_minimum_hosts + ` setting in outlier detection -outlier_detection.rq_volume_threshold - :ref:`rq_volume_threshold - ` +outlier_detection.success_rate_request_volume + :ref:`success_rate_request_volume + ` setting in outlier detection Core diff --git a/docs/intro/arch_overview/outlier.rst b/docs/intro/arch_overview/outlier.rst index 1e8e65f0614d2..160f1ffb0b76b 100644 --- a/docs/intro/arch_overview/outlier.rst +++ b/docs/intro/arch_overview/outlier.rst @@ -51,13 +51,13 @@ required for ejection is controlled by the :ref:`outlier_detection.consecutive_5 Success Rate ^^^^^^^^^^^^ -Success Rate based outlier ejection aggregates success rate data from every host in a cluster, and at a given -interval ejects hosts based on statistical outlier detection. Success Rate outlier ejection will not be +Success Rate based outlier ejection aggregates success rate data from every host in a cluster. Then at given +intervals ejects hosts based on statistical outlier detection. Success Rate outlier ejection will not be calculated for a host if its request volume over the aggregation interval is less than the -:ref:`outlier_detection.rq_volume_threshold` +:ref:`outlier_detection.success_rate_request_volume` value. Moreover, detection will not be performed for a cluster if the number of hosts with the minimum required request volume in an interval is less than the -:ref:`outlier_detection.significant_host_threshold` +:ref:`outlier_detection.success_rate_minimum_hosts` value. Ejection event logging diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index f963674a432cb..bedc7d1fb08a2 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -1,4 +1,4 @@ -#include +#include #include "outlier_detection_impl.h" #include "envoy/event/dispatcher.h" @@ -34,12 +34,12 @@ void DetectorHostSinkImpl::uneject(SystemTime unejection_time) { last_unejection_time_.value(unejection_time); } -void DetectorHostSinkImpl::updateCurrentSRBucket() { - sr_accumulator_bucket_.store(sr_accumulator_.getCurrentWriter()); +void DetectorHostSinkImpl::updateCurrentSuccessRateBucket() { + success_rate_accumulator_bucket_.store(success_rate_accumulator_.updateCurrentWriter()); } void DetectorHostSinkImpl::putHttpResponseCode(uint64_t response_code) { - sr_accumulator_bucket_.load()->total_rq_counter_++; + success_rate_accumulator_bucket_.load()->total_request_counter_++; if (Http::CodeUtility::is5xx(response_code)) { std::shared_ptr detector = detector_.lock(); if (!detector) { @@ -53,7 +53,7 @@ void DetectorHostSinkImpl::putHttpResponseCode(uint64_t response_code) { detector->onConsecutive5xx(host_.lock()); } } else { - sr_accumulator_bucket_.load()->success_rq_counter_++; + success_rate_accumulator_bucket_.load()->success_request_counter_++; consecutive_5xx_ = 0; } } @@ -65,13 +65,14 @@ DetectorConfig::DetectorConfig(const Json::Object& json_config) consecutive_5xx_(static_cast(json_config.getInteger("consecutive_5xx", 5))), max_ejection_percent_( static_cast(json_config.getInteger("max_ejection_percent", 10))), - significant_host_threshold_( - static_cast(json_config.getInteger("significant_host_threshold", 5))), - rq_volume_threshold_( - static_cast(json_config.getInteger("rq_volume_threshold", 100))), + success_rate_minimum_hosts_( + static_cast(json_config.getInteger("success_rate_minimum_hosts", 5))), + success_rate_request_volume_( + static_cast(json_config.getInteger("success_rate_request_volume", 100))), enforcing_consecutive_5xx_( static_cast(json_config.getInteger("enforcing_consecutive_5xx", 100))), - enforcing_sr_(static_cast(json_config.getInteger("enforcing_sr", 100))) {} + enforcing_success_rate_( + static_cast(json_config.getInteger("enforcing_success_rate", 100))) {} DetectorImpl::DetectorImpl(const Cluster& cluster, const Json::Object& json_config, Event::Dispatcher& dispatcher, Runtime::Loader& runtime, @@ -164,8 +165,8 @@ bool DetectorImpl::enforceEjection(EjectionType type) { return runtime_.snapshot().featureEnabled("outlier_detection.enforcing_consecutive_5xx", config_.enforcingConsecutive5xx()); case EjectionType::SuccessRate: - return runtime_.snapshot().featureEnabled("outlier_detection.enforcing_sr", - config_.enforcingSR()); + return runtime_.snapshot().featureEnabled("outlier_detection.enforcing_success_rate", + config_.enforcingSuccessRate()); } NOT_REACHED; @@ -233,58 +234,79 @@ void DetectorImpl::onConsecutive5xxWorker(HostPtr host) { ejectHost(host, EjectionType::Consecutive5xx); } -const double Utility::SR_STDEV_FACTOR = 1.9; - -double Utility::srEjectionThreshold(double sr_sum, std::vector& sr_data) { - double mean = sr_sum / sr_data.size(); - double stdev = 0; - std::for_each(sr_data.begin(), sr_data.end(), - [&stdev, mean](double& v) { stdev += std::pow(v - mean, 2); }); - stdev /= sr_data.size(); - stdev = std::sqrt(stdev); - - return mean - (SR_STDEV_FACTOR * stdev); +const double Utility::SUCCESS_RATE_STDEV_FACTOR = 1.9; + +double Utility::successRateEjectionThreshold(double success_rate_sum, + std::vector& success_rate_data) { + // This function is using mean and standard deviation as statistical measures for outlier + // detection. First the mean is calculated by dividing the sum of success rate data over the + // number of data points. Then variance is calculated by taking the mean of the + // squared difference of data points to the mean of the data. Then standard deviation is + // calculated by taking the square root of the variance. Then the outlier threshold is + // calculated as the difference between the mean and the product of the standard + // deviation and a constant factor. + // + // For example with a data set that looks like success_rate_data = {50, 100, 100, 100, 100} the + // math would work as follows: + // success_rate_sum = 450 + // mean = 90 + // variance = 400 + // stdev = 20 + // threshold returned = 52 + double mean = success_rate_sum / success_rate_data.size(); + double variance = 0; + std::for_each(success_rate_data.begin(), success_rate_data.end(), + [&variance, mean](double& v) { variance += std::pow(v - mean, 2); }); + variance /= success_rate_data.size(); + double stdev = std::sqrt(variance); + + return mean - (SUCCESS_RATE_STDEV_FACTOR * stdev); } void DetectorImpl::onIntervalTimer() { SystemTime now = time_source_.currentSystemTime(); - std::unordered_map valid_sr_hosts; - uint64_t significant_host_threshold = runtime_.snapshot().getInteger( - "outlier_detection.significant_host_threshold", config_.significantHostThreshold()); - uint64_t rq_volume_threshold = runtime_.snapshot().getInteger( - "outlier_detection.rq_volume_threshold", config_.rqVolumeThreshold()); - std::vector sr_data; - double sr_sum = 0; - - if (host_sinks_.size() >= significant_host_threshold) { + + // data for success rate outlier ejection + std::unordered_map valid_success_rate_hosts; + uint64_t success_rate_minimum_hosts = runtime_.snapshot().getInteger( + "outlier_detection.success_rate_minimum_hosts", config_.successRateMinimumHosts()); + uint64_t success_rate_request_volume = runtime_.snapshot().getInteger( + "outlier_detection.success_rate_request_volume", config_.successRateRequestVolume()); + std::vector success_rate_data; + double success_rate_sum = 0; + + if (host_sinks_.size() >= success_rate_minimum_hosts) { // reserve upper bound of vector size to avoid reallocation. - sr_data.reserve(host_sinks_.size()); + success_rate_data.reserve(host_sinks_.size()); } for (auto host : host_sinks_) { checkHostForUneject(host.first, host.second, now); - // Success Rate Outlier Detection. - // First swap out the current bucket been written to, to keep data valid. - host.second->updateCurrentSRBucket(); + host.second->updateCurrentSuccessRateBucket(); // If there are not enough hosts to begin with, don't do the work. - if (host_sinks_.size() >= significant_host_threshold) { - Optional host_sr = host.second->srAccumulator().getSR(rq_volume_threshold); - - if (host_sr.valid()) { - valid_sr_hosts[host.first] = host_sr.value(); - sr_data.emplace_back(host_sr.value()); - sr_sum += host_sr.value(); + if (host_sinks_.size() >= success_rate_minimum_hosts) { + // Don't do work if host is already ejected. + if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) { + Optional host_success_rate = + host.second->successRateAccumulator().getSuccessRate(success_rate_request_volume); + + if (host_success_rate.valid()) { + valid_success_rate_hosts[host.first] = host_success_rate.value(); + success_rate_data.emplace_back(host_success_rate.value()); + success_rate_sum += host_success_rate.value(); + } } } } - if (valid_sr_hosts.size() >= significant_host_threshold) { - double ejection_threshold = Utility::srEjectionThreshold(sr_sum, sr_data); - for (auto host : valid_sr_hosts) { + if (valid_success_rate_hosts.size() >= success_rate_minimum_hosts) { + double ejection_threshold = + Utility::successRateEjectionThreshold(success_rate_sum, success_rate_data); + for (auto host : valid_success_rate_hosts) { if (host.second < ejection_threshold) { - stats_.ejections_sr_.inc(); + stats_.ejections_success_rate_.inc(); ejectHost(host.first, EjectionType::SuccessRate); } } @@ -345,7 +367,7 @@ std::string EventLoggerImpl::typeToString(EjectionType type) { case EjectionType::Consecutive5xx: return "5xx"; case EjectionType::SuccessRate: - return "SR"; + return "SuccessRate"; } NOT_REACHED; @@ -359,23 +381,23 @@ int EventLoggerImpl::secsSinceLastAction(const Optional& lastActionT return -1; } -SRAccumulatorBucket* SRAccumulatorImpl::getCurrentWriter() { - // Right now current_ is being written to and backup_ is not. Flush the backup and swap. - backup_sr_bucket_->success_rq_counter_ = 0; - backup_sr_bucket_->total_rq_counter_ = 0; +SuccessRateAccumulatorBucket* SuccessRateAccumulator::updateCurrentWriter() { + // Right now current is being written to and backup is not. Flush the backup and swap. + backup_success_rate_bucket_->success_request_counter_ = 0; + backup_success_rate_bucket_->total_request_counter_ = 0; - current_sr_bucket_.swap(backup_sr_bucket_); + current_success_rate_bucket_.swap(backup_success_rate_bucket_); - return current_sr_bucket_.get(); + return current_success_rate_bucket_.get(); } -Optional SRAccumulatorImpl::getSR(uint64_t rq_volume_threshold) { - if (backup_sr_bucket_->total_rq_counter_ < rq_volume_threshold) { +Optional SuccessRateAccumulator::getSuccessRate(uint64_t success_rate_request_volume) { + if (backup_success_rate_bucket_->total_request_counter_ < success_rate_request_volume) { return Optional(); } - return Optional(backup_sr_bucket_->success_rq_counter_ * 100 / - backup_sr_bucket_->total_rq_counter_); + return Optional(backup_success_rate_bucket_->success_request_counter_ * 100 / + backup_success_rate_bucket_->total_request_counter_); } } // Outlier diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 4ce2590076b8f..5d125537323eb 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -37,37 +37,41 @@ class DetectorImplFactory { EventLoggerPtr event_logger); }; -struct SRAccumulatorBucket { - std::atomic success_rq_counter_; - std::atomic total_rq_counter_; +struct SuccessRateAccumulatorBucket { + std::atomic success_request_counter_; + std::atomic total_request_counter_; }; /** - * The S(uccess)R(ate)AccumulatorImpl uses the SRAccumulatorBucket to get per host Success Rate + * The SuccessRateAccumulator uses the SuccessRateAccumulatorBucket to get per host success rate * stats. This implementation has a fixed window size of time, and thus only needs a * bucket to write to, and a bucket to accumulate/run stats over. */ -class SRAccumulatorImpl { +class SuccessRateAccumulator { public: - SRAccumulatorImpl() - : current_sr_bucket_(new SRAccumulatorBucket()), - backup_sr_bucket_(new SRAccumulatorBucket()) {} + SuccessRateAccumulator() + : current_success_rate_bucket_(new SuccessRateAccumulatorBucket()), + backup_success_rate_bucket_(new SuccessRateAccumulatorBucket()) {} - SRAccumulatorBucket* getCurrentWriter(); /** - * This function returns the SR of a host over a window of time if the request volume is high - * enough. The underlying window of time could be dynamically adjusted. In the current + * This function updates the bucket to write data to. + * @return a pointer to the SuccessRateAccumulatorBucket. + */ + SuccessRateAccumulatorBucket* updateCurrentWriter(); + /** + * This function returns the success rate of a host over a window of time if the request volume is + * high enough. The underlying window of time could be dynamically adjusted. In the current * implementation it is a fixed time window. - * @param rq_volume_threshold the threshold of requests an accumulator has to have in order to be - * able to return a significant SR value. + * @param request_volume_threshold the threshold of requests an accumulator has to have in order + * to be able to return a significant success rate value. * @return a valid Optional with the success rate. If there were not enough requests, an * invalid Optional is returned. */ - Optional getSR(uint64_t rq_volume_threshold); + Optional getSuccessRate(uint64_t success_rate_request_volume); private: - std::unique_ptr current_sr_bucket_; - std::unique_ptr backup_sr_bucket_; + std::unique_ptr current_success_rate_bucket_; + std::unique_ptr backup_success_rate_bucket_; }; class DetectorImpl; @@ -80,13 +84,13 @@ class DetectorHostSinkImpl : public DetectorHostSink { DetectorHostSinkImpl(std::shared_ptr detector, HostPtr host) : detector_(detector), host_(host) { // Point the sr_accumulator_bucket_ pointer to a bucket. - updateCurrentSRBucket(); + updateCurrentSuccessRateBucket(); } void eject(SystemTime ejection_time); void uneject(SystemTime ejection_time); - void updateCurrentSRBucket(); - SRAccumulatorImpl& srAccumulator() { return sr_accumulator_; }; + void updateCurrentSuccessRateBucket(); + SuccessRateAccumulator& successRateAccumulator() { return success_rate_accumulator_; }; // Upstream::Outlier::DetectorHostSink uint32_t numEjections() override { return num_ejections_; } @@ -102,8 +106,8 @@ class DetectorHostSinkImpl : public DetectorHostSink { Optional last_ejection_time_; Optional last_unejection_time_; uint32_t num_ejections_{}; - SRAccumulatorImpl sr_accumulator_; - std::atomic sr_accumulator_bucket_; + SuccessRateAccumulator success_rate_accumulator_; + std::atomic success_rate_accumulator_bucket_; }; /** @@ -115,7 +119,7 @@ class DetectorHostSinkImpl : public DetectorHostSink { GAUGE (ejections_active) \ COUNTER(ejections_overflow) \ COUNTER(ejections_consecutive_5xx) \ - COUNTER(ejections_sr) + COUNTER(ejections_success_rate) // clang-format on /** @@ -136,20 +140,20 @@ class DetectorConfig { uint64_t baseEjectionTimeMs() { return base_ejection_time_ms_; } uint64_t consecutive5xx() { return consecutive_5xx_; } uint64_t maxEjectionPercent() { return max_ejection_percent_; } - uint64_t significantHostThreshold() { return significant_host_threshold_; } - uint64_t rqVolumeThreshold() { return rq_volume_threshold_; } + uint64_t successRateMinimumHosts() { return success_rate_minimum_hosts_; } + uint64_t successRateRequestVolume() { return success_rate_request_volume_; } uint64_t enforcingConsecutive5xx() { return enforcing_consecutive_5xx_; } - uint64_t enforcingSR() { return enforcing_sr_; } + uint64_t enforcingSuccessRate() { return enforcing_success_rate_; } private: const uint64_t interval_ms_; const uint64_t base_ejection_time_ms_; const uint64_t consecutive_5xx_; const uint64_t max_ejection_percent_; - const uint64_t significant_host_threshold_; - const uint64_t rq_volume_threshold_; + const uint64_t success_rate_minimum_hosts_; + const uint64_t success_rate_request_volume_; const uint64_t enforcing_consecutive_5xx_; - const uint64_t enforcing_sr_; + const uint64_t enforcing_success_rate_; }; /** @@ -222,17 +226,18 @@ class EventLoggerImpl : public EventLogger { class Utility { public: /** - * This function returns the Success Rate trheshold for Success Rate outlier detection. If a - * host's Success Rate is under this threshold the host is an outlier. - * @param sr_sum is the sum of the data in the sr_data vector. - * @param sr_data is the vector containing the individual success rate data points. - * @return the Success Rate threshold. + * This function returns the success rate threshold for success rate outlier detection. If a + * host's success rate is under this threshold the host is an outlier. + * @param success_rate_sum is the sum of the data in the success_rate_data vector. + * @param success_rate_data is the vector containing the individual success rate data points. + * @return the success rate threshold. */ - static double srEjectionThreshold(double sr_sum, std::vector& sr_data); + static double successRateEjectionThreshold(double success_rate_sum, + std::vector& success_rate_data); private: - // Factor to multiply the stdev of a cluster's Success Rate for success rate outlier ejection. - static const double SR_STDEV_FACTOR; + // Factor to multiply the stdev of a cluster's success rate for success rate outlier ejection. + static const double SUCCESS_RATE_STDEV_FACTOR; }; } // Outlier diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index b4cdf57065b8a..f6ca92e09724f 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -53,7 +53,7 @@ class OutlierDetectorImplTest : public testing::Test { OutlierDetectorImplTest() { ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_5xx", 100)) .WillByDefault(Return(true)); - ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_sr", 100)) + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_success_rate", 100)) .WillByDefault(Return(true)); } @@ -87,7 +87,7 @@ TEST_F(OutlierDetectorImplTest, DetectorStaticConfig) { "consecutive_5xx" : 10, "max_ejection_percent" : 50, "enforcing_consecutive_5xx" : 10, - "enforcing_sr": 20 + "enforcing_success_rate": 20 } )EOF"; @@ -101,7 +101,7 @@ TEST_F(OutlierDetectorImplTest, DetectorStaticConfig) { EXPECT_EQ(10UL, detector->config().consecutive5xx()); EXPECT_EQ(50UL, detector->config().maxEjectionPercent()); EXPECT_EQ(10UL, detector->config().enforcingConsecutive5xx()); - EXPECT_EQ(20UL, detector->config().enforcingSR()); + EXPECT_EQ(20UL, detector->config().enforcingSuccessRate()); } TEST_F(OutlierDetectorImplTest, DestroyWithActive) { @@ -215,7 +215,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { .value()); } -TEST_F(OutlierDetectorImplTest, BasicFlowSR) { +TEST_F(OutlierDetectorImplTest, BasicFlowSuccessRate) { EXPECT_CALL(cluster_, addMemberUpdateCb(_)); cluster_.hosts_ = { HostPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), @@ -522,7 +522,7 @@ TEST(OutlierDetectionEventLoggerImplTest, All) { EXPECT_CALL(*file, write("{\"time\": \"1970-01-01T00:00:00.000Z\", \"secs_since_last_action\": " "\"30\", \"cluster\": " "\"fake_cluster\", \"upstream_url\": \"10.0.0.1:443\", \"action\": " - "\"eject\", \"type\": \"SR\", \"num_ejections\": 0}\n")) + "\"eject\", \"type\": \"SuccessRate\", \"num_ejections\": 0}\n")) .WillOnce(SaveArg<0>(&log3)); event_logger.logEject(host, EjectionType::SuccessRate); Json::Factory::LoadFromString(log3); @@ -541,7 +541,7 @@ TEST(OutlierUtility, SRThreshold) { std::vector data = {50, 100, 100, 100, 100}; double sum = std::accumulate(data.begin(), data.end(), 0.0); - EXPECT_EQ(Utility::srEjectionThreshold(sum, data), 52); + EXPECT_EQ(Utility::successRateEjectionThreshold(sum, data), 52); } } // Outlier From fd5491eb566f56a2949da3bfc6fb08dcb7af801b Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Wed, 22 Mar 2017 14:41:13 -0700 Subject: [PATCH 20/29] schema --- source/common/json/config_schemas.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/common/json/config_schemas.cc b/source/common/json/config_schemas.cc index 40a834ebd049d..59e0093e30264 100644 --- a/source/common/json/config_schemas.cc +++ b/source/common/json/config_schemas.cc @@ -1089,12 +1089,12 @@ const std::string Json::Schema::CLUSTER_SCHEMA(R"EOF( "minimum" : 0, "exclusiveMinimum" : true }, - "significant_host_threshold" : { + "success_rate_minimum_hosts" : { "type" : "integer", "minimum" : 0, "exclusiveMinimum" : true }, - "rq_volume_threshold" : { + "success_rate_request_volume" : { "type" : "integer", "minimum" : 0, "exclusiveMinimum" : true @@ -1119,7 +1119,7 @@ const std::string Json::Schema::CLUSTER_SCHEMA(R"EOF( "minimum" : 0, "maximum" : 100 }, - "enforcing_sr" : { + "enforcing_success_rate" : { "type" : "integer", "minimum" : 0, "maximum" : 100 From ae980da6098d1f2f473f8495ca618667e37728a7 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Wed, 22 Mar 2017 14:59:13 -0700 Subject: [PATCH 21/29] separation --- .../common/upstream/outlier_detection_impl.cc | 19 ++++++++++++------- .../common/upstream/outlier_detection_impl.h | 1 + 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index bedc7d1fb08a2..0743807b4b2f2 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -263,10 +263,7 @@ double Utility::successRateEjectionThreshold(double success_rate_sum, return mean - (SUCCESS_RATE_STDEV_FACTOR * stdev); } -void DetectorImpl::onIntervalTimer() { - SystemTime now = time_source_.currentSystemTime(); - - // data for success rate outlier ejection +void DetectorImpl::successRateEjections() { std::unordered_map valid_success_rate_hosts; uint64_t success_rate_minimum_hosts = runtime_.snapshot().getInteger( "outlier_detection.success_rate_minimum_hosts", config_.successRateMinimumHosts()); @@ -281,13 +278,11 @@ void DetectorImpl::onIntervalTimer() { } for (auto host : host_sinks_) { - checkHostForUneject(host.first, host.second, now); - host.second->updateCurrentSuccessRateBucket(); // If there are not enough hosts to begin with, don't do the work. if (host_sinks_.size() >= success_rate_minimum_hosts) { - // Don't do work if host is already ejected. + // Don't do work if the host is already ejected. if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) { Optional host_success_rate = host.second->successRateAccumulator().getSuccessRate(success_rate_request_volume); @@ -311,6 +306,16 @@ void DetectorImpl::onIntervalTimer() { } } } +} + +void DetectorImpl::onIntervalTimer() { + SystemTime now = time_source_.currentSystemTime(); + + for (auto host : host_sinks_) { + checkHostForUneject(host.first, host.second, now); + } + + successRateEjections(); armIntervalTimer(); } diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 5d125537323eb..bb3dfd7c3e7e9 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -190,6 +190,7 @@ class DetectorImpl : public Detector, public std::enable_shared_from_this Date: Wed, 22 Mar 2017 15:44:40 -0700 Subject: [PATCH 22/29] fix tests after merge --- .../upstream/outlier_detection_impl_test.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 4af2e819673cc..8757a2c25b618 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -57,13 +57,13 @@ class OutlierDetectorImplTest : public testing::Test { .WillByDefault(Return(true)); } - void loadRq(std::vector& hosts, int num_rq, int http_code) { + void loadRq(std::vector& hosts, int num_rq, int http_code) { for (uint64_t i = 0; i < hosts.size(); i++) { loadRq(hosts[i], num_rq, http_code); } } - void loadRq(HostPtr host, int num_rq, int http_code) { + void loadRq(HostSharedPtr host, int num_rq, int http_code) { for (int i = 0; i < num_rq; i++) { host->outlierDetector().putHttpResponseCode(http_code); } @@ -218,20 +218,20 @@ TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { TEST_F(OutlierDetectorImplTest, BasicFlowSuccessRate) { EXPECT_CALL(cluster_, addMemberUpdateCb(_)); cluster_.hosts_ = { - HostPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), + HostSharedPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}, - HostPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:81"), + HostSharedPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:81"), false, 1, "")}, - HostPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:82"), + HostSharedPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:82"), false, 1, "")}, - HostPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:83"), + HostSharedPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:83"), false, 1, "")}, - HostPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:84"), + HostSharedPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:84"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); - detector->addChangedStateCb([&](HostPtr host) -> void { checker_.check(host); }); + detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); // Turn off 5xx detection to test SR detection in isolation. ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_5xx", 100)) From b196f43a67d3bc1bf1696c6ae1e8639f896014b2 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Wed, 22 Mar 2017 15:55:07 -0700 Subject: [PATCH 23/29] fix --- .../upstream/outlier_detection_impl_test.cc | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 8757a2c25b618..5de7079e97bc2 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -218,16 +218,16 @@ TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { TEST_F(OutlierDetectorImplTest, BasicFlowSuccessRate) { EXPECT_CALL(cluster_, addMemberUpdateCb(_)); cluster_.hosts_ = { - HostSharedPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), - false, 1, "")}, - HostSharedPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:81"), - false, 1, "")}, - HostSharedPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:82"), - false, 1, "")}, - HostSharedPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:83"), - false, 1, "")}, - HostSharedPtr{new HostImpl(cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:84"), - false, 1, "")}}; + HostSharedPtr{new HostImpl(cluster_.info_, "", + Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}, + HostSharedPtr{new HostImpl(cluster_.info_, "", + Network::Utility::resolveUrl("tcp://127.0.0.1:81"), false, 1, "")}, + HostSharedPtr{new HostImpl(cluster_.info_, "", + Network::Utility::resolveUrl("tcp://127.0.0.1:82"), false, 1, "")}, + HostSharedPtr{new HostImpl(cluster_.info_, "", + Network::Utility::resolveUrl("tcp://127.0.0.1:83"), false, 1, "")}, + HostSharedPtr{new HostImpl( + cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:84"), false, 1, "")}}; EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); std::shared_ptr detector( DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); From cd3867a2b85ab8741a099e57218858b2c3a27695 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Wed, 22 Mar 2017 16:09:29 -0700 Subject: [PATCH 24/29] remove clion include --- source/common/upstream/outlier_detection_impl.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 3d3bd3e4d6fc2..9790578691e79 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -1,4 +1,3 @@ -#include #include "outlier_detection_impl.h" #include "envoy/event/dispatcher.h" From e0f3a9fc932ed76298695f9b0b9c17840d33bd48 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Wed, 22 Mar 2017 16:13:01 -0700 Subject: [PATCH 25/29] fix comment --- source/common/upstream/outlier_detection_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index c5df346850822..7b8ba46e1cf03 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -83,7 +83,7 @@ class DetectorHostSinkImpl : public DetectorHostSink { public: DetectorHostSinkImpl(std::shared_ptr detector, HostSharedPtr host) : detector_(detector), host_(host) { - // Point the sr_accumulator_bucket_ pointer to a bucket. + // Point the success_rate_accumulator_bucket_ pointer to a bucket. updateCurrentSuccessRateBucket(); } From d5750e34189e07c5890c88d82d48cf6cb23fce9e Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Wed, 22 Mar 2017 21:50:24 -0700 Subject: [PATCH 26/29] comments --- .../common/upstream/outlier_detection_impl.cc | 61 ++++++++++--------- .../common/upstream/outlier_detection_impl.h | 5 +- .../upstream/outlier_detection_impl_test.cc | 7 ++- 3 files changed, 41 insertions(+), 32 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 9790578691e79..bade762b80374 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -234,10 +234,15 @@ void DetectorImpl::onConsecutive5xxWorker(HostSharedPtr host) { ejectHost(host, EjectionType::Consecutive5xx); } +// The canonical factor for outlier detection in normal distributions is 2. However, host +// success rates are intuitively a distribution with negative skew, with most of the mass around +// 100 and a left tail. Therefore, a more aggressive (lower) factor is needed to detect +// outliers. const double Utility::SUCCESS_RATE_STDEV_FACTOR = 1.9; -double Utility::successRateEjectionThreshold(double success_rate_sum, - std::vector& success_rate_data) { +double Utility::successRateEjectionThreshold( + double success_rate_sum, + const std::vector>& valid_success_rate_hosts) { // This function is using mean and standard deviation as statistical measures for outlier // detection. First the mean is calculated by dividing the sum of success rate data over the // number of data points. Then variance is calculated by taking the mean of the @@ -253,56 +258,56 @@ double Utility::successRateEjectionThreshold(double success_rate_sum, // variance = 400 // stdev = 20 // threshold returned = 52 - double mean = success_rate_sum / success_rate_data.size(); + double mean = success_rate_sum / valid_success_rate_hosts.size(); double variance = 0; - std::for_each(success_rate_data.begin(), success_rate_data.end(), - [&variance, mean](double& v) { variance += std::pow(v - mean, 2); }); - variance /= success_rate_data.size(); + std::for_each(valid_success_rate_hosts.begin(), valid_success_rate_hosts.end(), + [&variance, mean](std::tuple v) { + variance += std::pow(std::get<1>(v) - mean, 2); + }); + variance /= valid_success_rate_hosts.size(); double stdev = std::sqrt(variance); return mean - (SUCCESS_RATE_STDEV_FACTOR * stdev); } void DetectorImpl::successRateEjections() { - std::unordered_map valid_success_rate_hosts; uint64_t success_rate_minimum_hosts = runtime_.snapshot().getInteger( "outlier_detection.success_rate_minimum_hosts", config_.successRateMinimumHosts()); uint64_t success_rate_request_volume = runtime_.snapshot().getInteger( "outlier_detection.success_rate_request_volume", config_.successRateRequestVolume()); - std::vector success_rate_data; + std::vector> valid_success_rate_hosts; double success_rate_sum = 0; - if (host_sinks_.size() >= success_rate_minimum_hosts) { - // reserve upper bound of vector size to avoid reallocation. - success_rate_data.reserve(host_sinks_.size()); + // Exit early if there are not enough hosts. + if (host_sinks_.size() < success_rate_minimum_hosts) { + return; } + // reserve upper bound of vector size to avoid reallocation. + valid_success_rate_hosts.reserve(host_sinks_.size()); + for (auto host : host_sinks_) { host.second->updateCurrentSuccessRateBucket(); - - // If there are not enough hosts to begin with, don't do the work. - if (host_sinks_.size() >= success_rate_minimum_hosts) { - // Don't do work if the host is already ejected. - if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) { - Optional host_success_rate = - host.second->successRateAccumulator().getSuccessRate(success_rate_request_volume); - - if (host_success_rate.valid()) { - valid_success_rate_hosts[host.first] = host_success_rate.value(); - success_rate_data.emplace_back(host_success_rate.value()); - success_rate_sum += host_success_rate.value(); - } + // Don't do work if the host is already ejected. + if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) { + Optional host_success_rate = + host.second->successRateAccumulator().getSuccessRate(success_rate_request_volume); + + if (host_success_rate.valid()) { + valid_success_rate_hosts.emplace_back( + std::make_tuple(host.first, host_success_rate.value())); + success_rate_sum += host_success_rate.value(); } } } if (valid_success_rate_hosts.size() >= success_rate_minimum_hosts) { double ejection_threshold = - Utility::successRateEjectionThreshold(success_rate_sum, success_rate_data); - for (auto host : valid_success_rate_hosts) { - if (host.second < ejection_threshold) { + Utility::successRateEjectionThreshold(success_rate_sum, valid_success_rate_hosts); + for (auto tuple : valid_success_rate_hosts) { + if (std::get<1>(tuple) < ejection_threshold) { stats_.ejections_success_rate_.inc(); - ejectHost(host.first, EjectionType::SuccessRate); + ejectHost(std::get<0>(tuple), EjectionType::SuccessRate); } } } diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 7b8ba46e1cf03..3169449ab3896 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -234,8 +234,9 @@ class Utility { * @param success_rate_data is the vector containing the individual success rate data points. * @return the success rate threshold. */ - static double successRateEjectionThreshold(double success_rate_sum, - std::vector& success_rate_data); + static double successRateEjectionThreshold( + double success_rate_sum, + const std::vector>& valid_success_rate_hosts); private: // Factor to multiply the stdev of a cluster's success rate for success rate outlier ejection. diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 5de7079e97bc2..32061e29849df 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -538,8 +538,11 @@ TEST(OutlierDetectionEventLoggerImplTest, All) { } TEST(OutlierUtility, SRThreshold) { - std::vector data = {50, 100, 100, 100, 100}; - double sum = std::accumulate(data.begin(), data.end(), 0.0); + std::vector> data = { + std::make_tuple(nullptr, 50), std::make_tuple(nullptr, 100), std::make_tuple(nullptr, 100), + std::make_tuple(nullptr, 100), std::make_tuple(nullptr, 100), + }; + double sum = 450; EXPECT_EQ(Utility::successRateEjectionThreshold(sum, data), 52); } From 607512c9e4e16511d58ae86613472663b3e2ae29 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Thu, 23 Mar 2017 10:30:32 -0700 Subject: [PATCH 27/29] update comments --- .../common/upstream/outlier_detection_impl.cc | 21 +++++++++--------- .../common/upstream/outlier_detection_impl.h | 22 ++++++++++++++----- .../upstream/outlier_detection_impl_test.cc | 7 +++--- 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index bade762b80374..4297938e168f9 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -241,8 +241,7 @@ void DetectorImpl::onConsecutive5xxWorker(HostSharedPtr host) { const double Utility::SUCCESS_RATE_STDEV_FACTOR = 1.9; double Utility::successRateEjectionThreshold( - double success_rate_sum, - const std::vector>& valid_success_rate_hosts) { + double success_rate_sum, const std::vector& valid_success_rate_hosts) { // This function is using mean and standard deviation as statistical measures for outlier // detection. First the mean is calculated by dividing the sum of success rate data over the // number of data points. Then variance is calculated by taking the mean of the @@ -261,8 +260,8 @@ double Utility::successRateEjectionThreshold( double mean = success_rate_sum / valid_success_rate_hosts.size(); double variance = 0; std::for_each(valid_success_rate_hosts.begin(), valid_success_rate_hosts.end(), - [&variance, mean](std::tuple v) { - variance += std::pow(std::get<1>(v) - mean, 2); + [&variance, mean](HostSuccessRatePair v) { + variance += std::pow(v.success_rate_ - mean, 2); }); variance /= valid_success_rate_hosts.size(); double stdev = std::sqrt(variance); @@ -275,7 +274,7 @@ void DetectorImpl::successRateEjections() { "outlier_detection.success_rate_minimum_hosts", config_.successRateMinimumHosts()); uint64_t success_rate_request_volume = runtime_.snapshot().getInteger( "outlier_detection.success_rate_request_volume", config_.successRateRequestVolume()); - std::vector> valid_success_rate_hosts; + std::vector valid_success_rate_hosts; double success_rate_sum = 0; // Exit early if there are not enough hosts. @@ -286,7 +285,7 @@ void DetectorImpl::successRateEjections() { // reserve upper bound of vector size to avoid reallocation. valid_success_rate_hosts.reserve(host_sinks_.size()); - for (auto host : host_sinks_) { + for (const auto& host : host_sinks_) { host.second->updateCurrentSuccessRateBucket(); // Don't do work if the host is already ejected. if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) { @@ -295,7 +294,7 @@ void DetectorImpl::successRateEjections() { if (host_success_rate.valid()) { valid_success_rate_hosts.emplace_back( - std::make_tuple(host.first, host_success_rate.value())); + HostSuccessRatePair(host.first, host_success_rate.value())); success_rate_sum += host_success_rate.value(); } } @@ -304,10 +303,10 @@ void DetectorImpl::successRateEjections() { if (valid_success_rate_hosts.size() >= success_rate_minimum_hosts) { double ejection_threshold = Utility::successRateEjectionThreshold(success_rate_sum, valid_success_rate_hosts); - for (auto tuple : valid_success_rate_hosts) { - if (std::get<1>(tuple) < ejection_threshold) { + for (const auto& host_success_rate_pair : valid_success_rate_hosts) { + if (host_success_rate_pair.success_rate_ < ejection_threshold) { stats_.ejections_success_rate_.inc(); - ejectHost(std::get<0>(tuple), EjectionType::SuccessRate); + ejectHost(host_success_rate_pair.host_, EjectionType::SuccessRate); } } } @@ -406,7 +405,7 @@ Optional SuccessRateAccumulator::getSuccessRate(uint64_t success_rate_re return Optional(); } - return Optional(backup_success_rate_bucket_->success_request_counter_ * 100 / + return Optional(backup_success_rate_bucket_->success_request_counter_ * 100.0 / backup_success_rate_bucket_->total_request_counter_); } diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 3169449ab3896..4285336e920ac 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -37,6 +37,16 @@ class DetectorImplFactory { EventLoggerSharedPtr event_logger); }; +/** + * Thin struct to facilitate calculations for success rate outlier detection. + */ +struct HostSuccessRatePair { + HostSuccessRatePair(HostSharedPtr host, double success_rate) + : host_(host), success_rate_(success_rate) {} + HostSharedPtr host_; + double success_rate_; +}; + struct SuccessRateAccumulatorBucket { std::atomic success_request_counter_; std::atomic total_request_counter_; @@ -62,8 +72,8 @@ class SuccessRateAccumulator { * This function returns the success rate of a host over a window of time if the request volume is * high enough. The underlying window of time could be dynamically adjusted. In the current * implementation it is a fixed time window. - * @param request_volume_threshold the threshold of requests an accumulator has to have in order - * to be able to return a significant success rate value. + * @param success_rate_request_volume the threshold of requests an accumulator has to have in order + * to be able to return a significant success rate value. * @return a valid Optional with the success rate. If there were not enough requests, an * invalid Optional is returned. */ @@ -223,7 +233,7 @@ class EventLoggerImpl : public EventLogger { }; /** - * Utilities for Outlier Detection + * Utilities for Outlier Detection. */ class Utility { public: @@ -234,9 +244,9 @@ class Utility { * @param success_rate_data is the vector containing the individual success rate data points. * @return the success rate threshold. */ - static double successRateEjectionThreshold( - double success_rate_sum, - const std::vector>& valid_success_rate_hosts); + static double + successRateEjectionThreshold(double success_rate_sum, + const std::vector& valid_success_rate_hosts); private: // Factor to multiply the stdev of a cluster's success rate for success rate outlier ejection. diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index 32061e29849df..e44c1f372bb8d 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -538,9 +538,10 @@ TEST(OutlierDetectionEventLoggerImplTest, All) { } TEST(OutlierUtility, SRThreshold) { - std::vector> data = { - std::make_tuple(nullptr, 50), std::make_tuple(nullptr, 100), std::make_tuple(nullptr, 100), - std::make_tuple(nullptr, 100), std::make_tuple(nullptr, 100), + std::vector data = { + HostSuccessRatePair(nullptr, 50), HostSuccessRatePair(nullptr, 100), + HostSuccessRatePair(nullptr, 100), HostSuccessRatePair(nullptr, 100), + HostSuccessRatePair(nullptr, 100), }; double sum = 450; From 49c05874a4a4a7b0d16083c3429f79b0c3132295 Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Thu, 23 Mar 2017 10:52:21 -0700 Subject: [PATCH 28/29] fix format --- source/common/upstream/outlier_detection_impl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 4285336e920ac..af3d13034600d 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -72,8 +72,8 @@ class SuccessRateAccumulator { * This function returns the success rate of a host over a window of time if the request volume is * high enough. The underlying window of time could be dynamically adjusted. In the current * implementation it is a fixed time window. - * @param success_rate_request_volume the threshold of requests an accumulator has to have in order - * to be able to return a significant success rate value. + * @param success_rate_request_volume the threshold of requests an accumulator has to have in + * order to be able to return a significant success rate value. * @return a valid Optional with the success rate. If there were not enough requests, an * invalid Optional is returned. */ From 05dec2233da4288d98eb1ae4895ec8e761b33d7b Mon Sep 17 00:00:00 2001 From: Jose Nino Date: Thu, 23 Mar 2017 11:13:15 -0700 Subject: [PATCH 29/29] name change --- source/common/upstream/outlier_detection_impl.cc | 4 ++-- source/common/upstream/outlier_detection_impl.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 4297938e168f9..d5d40d6072d30 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -269,7 +269,7 @@ double Utility::successRateEjectionThreshold( return mean - (SUCCESS_RATE_STDEV_FACTOR * stdev); } -void DetectorImpl::successRateEjections() { +void DetectorImpl::processSuccessRateEjections() { uint64_t success_rate_minimum_hosts = runtime_.snapshot().getInteger( "outlier_detection.success_rate_minimum_hosts", config_.successRateMinimumHosts()); uint64_t success_rate_request_volume = runtime_.snapshot().getInteger( @@ -319,7 +319,7 @@ void DetectorImpl::onIntervalTimer() { checkHostForUneject(host.first, host.second, now); } - successRateEjections(); + processSuccessRateEjections(); armIntervalTimer(); } diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index af3d13034600d..18744966a1a25 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -201,7 +201,7 @@ class DetectorImpl : public Detector, public std::enable_shared_from_this