diff --git a/docs/configuration/cluster_manager/cluster.rst b/docs/configuration/cluster_manager/cluster.rst index cebb48b09aa38..cb78a199b917a 100644 --- a/docs/configuration/cluster_manager/cluster.rst +++ b/docs/configuration/cluster_manager/cluster.rst @@ -168,11 +168,33 @@ outlier_detection max_ejection_percent The maximum % of an upstream cluster that can be ejected due to outlier detection. Defaults to 10%. - .. _config_cluster_manager_cluster_outlier_detection_enforcing: + .. _config_cluster_manager_cluster_outlier_detection_enforcing_consecutive_5xx: - enforcing - The % chance that a host will be actually ejected when an outlier status is detected. This setting - can be used to disable ejection or to ramp it up slowly. Defaults to 100. + enforcing_consecutive_5xx + The % chance that a host will be actually ejected when an outlier status is detected through + consecutive 5xx. This setting can be used to disable ejection or to ramp it up slowly. Defaults to 100. + + .. _config_cluster_manager_cluster_outlier_detection_enforcing_success_rate: + + enforcing_success_rate + The % chance that a host will be actually ejected when an outlier status is detected through + success rate statistics. This setting can be used to disable ejection or to ramp it up slowly. + Defaults to 100. + + .. _config_cluster_manager_cluster_outlier_detection_success_rate_minimum_hosts: + + success_rate_minimum_hosts + The number of hosts in a cluster that must have enough request volume to detect success rate outliers. + If the number of hosts is less than this setting, outlier detection via success rate statistics is not + performed for any host in the cluster. Defaults to 5. + + .. _config_cluster_manager_cluster_outlier_detection_success_rate_request_volume: + + success_rate_request_volume + The minimum number of total requests that must be collected in one interval + (as defined by :ref:`interval_ms ` above) + to include this host in success rate based outlier detection. If the volume is lower than this setting, + outlier detection via success rate statistics is not performed for that host. Defaults to 100. Each of the above configuration values can be overridden via :ref:`runtime values `. diff --git a/docs/configuration/cluster_manager/cluster_runtime.rst b/docs/configuration/cluster_manager/cluster_runtime.rst index 427fe57da3fc5..d284d3b935d17 100644 --- a/docs/configuration/cluster_manager/cluster_runtime.rst +++ b/docs/configuration/cluster_manager/cluster_runtime.rst @@ -52,9 +52,24 @@ outlier_detection.max_ejection_percent ` setting in outlier detection -outlier_detection.enforcing - :ref:`enforcing - ` +outlier_detection.enforcing_consecutive_5xx + :ref:`enforcing_consecutive_5xx + ` + setting in outlier detection + +outlier_detection.enforcing_success_rate + :ref:`enforcing_success_rate + ` + setting in outlier detection + +outlier_detection.success_rate_minimum_hosts + :ref:`success_rate_minimum_hosts + ` + setting in outlier detection + +outlier_detection.success_rate_request_volume + :ref:`success_rate_request_volume + ` setting in outlier detection Core diff --git a/docs/intro/arch_overview/outlier.rst b/docs/intro/arch_overview/outlier.rst index ee333648e6550..160f1ffb0b76b 100644 --- a/docs/intro/arch_overview/outlier.rst +++ b/docs/intro/arch_overview/outlier.rst @@ -46,7 +46,19 @@ If an upstream host returns some number of consecutive 5xx, it will be ejected. case a 5xx means an actual 5xx respond code, or an event that would cause the HTTP router to return one on the upstream's behalf (reset, connection failure, etc.). The number of consecutive 5xx required for ejection is controlled by the :ref:`outlier_detection.consecutive_5xx -` value. +` value. + +Success Rate +^^^^^^^^^^^^ + +Success Rate based outlier ejection aggregates success rate data from every host in a cluster. Then at given +intervals ejects hosts based on statistical outlier detection. Success Rate outlier ejection will not be +calculated for a host if its request volume over the aggregation interval is less than the +:ref:`outlier_detection.success_rate_request_volume` +value. Moreover, detection will not be performed for a cluster if the number of hosts +with the minimum required request volume in an interval is less than the +:ref:`outlier_detection.success_rate_minimum_hosts` +value. Ejection event logging ---------------------- diff --git a/include/envoy/upstream/outlier_detection.h b/include/envoy/upstream/outlier_detection.h index 263639f1dabd2..f4c9cb256dab4 100644 --- a/include/envoy/upstream/outlier_detection.h +++ b/include/envoy/upstream/outlier_detection.h @@ -52,7 +52,7 @@ class DetectorHostSink { typedef std::unique_ptr DetectorHostSinkPtr; -enum class EjectionType { Consecutive5xx }; +enum class EjectionType { Consecutive5xx, SuccessRate }; /** * Sink for outlier detection event logs. diff --git a/source/common/json/config_schemas.cc b/source/common/json/config_schemas.cc index f40baa6fa7155..59e0093e30264 100644 --- a/source/common/json/config_schemas.cc +++ b/source/common/json/config_schemas.cc @@ -1089,6 +1089,16 @@ const std::string Json::Schema::CLUSTER_SCHEMA(R"EOF( "minimum" : 0, "exclusiveMinimum" : true }, + "success_rate_minimum_hosts" : { + "type" : "integer", + "minimum" : 0, + "exclusiveMinimum" : true + }, + "success_rate_request_volume" : { + "type" : "integer", + "minimum" : 0, + "exclusiveMinimum" : true + }, "interval_ms" : { "type" : "integer", "minimum" : 0, @@ -1104,7 +1114,12 @@ const std::string Json::Schema::CLUSTER_SCHEMA(R"EOF( "minimum" : 0, "maximum" : 100 }, - "enforcing" : { + "enforcing_consecutive_5xx" : { + "type" : "integer", + "minimum" : 0, + "maximum" : 100 + }, + "enforcing_success_rate" : { "type" : "integer", "minimum" : 0, "maximum" : 100 diff --git a/source/common/upstream/outlier_detection_impl.cc b/source/common/upstream/outlier_detection_impl.cc index 4d32e9edc2120..d5d40d6072d30 100644 --- a/source/common/upstream/outlier_detection_impl.cc +++ b/source/common/upstream/outlier_detection_impl.cc @@ -33,7 +33,12 @@ void DetectorHostSinkImpl::uneject(SystemTime unejection_time) { last_unejection_time_.value(unejection_time); } +void DetectorHostSinkImpl::updateCurrentSuccessRateBucket() { + success_rate_accumulator_bucket_.store(success_rate_accumulator_.updateCurrentWriter()); +} + void DetectorHostSinkImpl::putHttpResponseCode(uint64_t response_code) { + success_rate_accumulator_bucket_.load()->total_request_counter_++; if (Http::CodeUtility::is5xx(response_code)) { std::shared_ptr detector = detector_.lock(); if (!detector) { @@ -47,6 +52,7 @@ void DetectorHostSinkImpl::putHttpResponseCode(uint64_t response_code) { detector->onConsecutive5xx(host_.lock()); } } else { + success_rate_accumulator_bucket_.load()->success_request_counter_++; consecutive_5xx_ = 0; } } @@ -58,7 +64,14 @@ DetectorConfig::DetectorConfig(const Json::Object& json_config) consecutive_5xx_(static_cast(json_config.getInteger("consecutive_5xx", 5))), max_ejection_percent_( static_cast(json_config.getInteger("max_ejection_percent", 10))), - enforcing_(static_cast(json_config.getInteger("enforcing", 100))) {} + success_rate_minimum_hosts_( + static_cast(json_config.getInteger("success_rate_minimum_hosts", 5))), + success_rate_request_volume_( + static_cast(json_config.getInteger("success_rate_request_volume", 100))), + enforcing_consecutive_5xx_( + static_cast(json_config.getInteger("enforcing_consecutive_5xx", 100))), + enforcing_success_rate_( + static_cast(json_config.getInteger("enforcing_success_rate", 100))) {} DetectorImpl::DetectorImpl(const Cluster& cluster, const Json::Object& json_config, Event::Dispatcher& dispatcher, Runtime::Loader& runtime, @@ -146,6 +159,19 @@ void DetectorImpl::checkHostForUneject(HostSharedPtr host, DetectorHostSinkImpl* } } +bool DetectorImpl::enforceEjection(EjectionType type) { + switch (type) { + case EjectionType::Consecutive5xx: + return runtime_.snapshot().featureEnabled("outlier_detection.enforcing_consecutive_5xx", + config_.enforcingConsecutive5xx()); + case EjectionType::SuccessRate: + return runtime_.snapshot().featureEnabled("outlier_detection.enforcing_success_rate", + config_.enforcingSuccessRate()); + } + + NOT_REACHED; +} + void DetectorImpl::ejectHost(HostSharedPtr host, EjectionType type) { uint64_t max_ejection_percent = std::min( 100, runtime_.snapshot().getInteger("outlier_detection.max_ejection_percent", @@ -153,7 +179,7 @@ void DetectorImpl::ejectHost(HostSharedPtr host, EjectionType type) { double ejected_percent = 100.0 * stats_.ejections_active_.value() / host_sinks_.size(); if (ejected_percent < max_ejection_percent) { stats_.ejections_total_.inc(); - if (runtime_.snapshot().featureEnabled("outlier_detection.enforcing", config_.enforcing())) { + if (enforceEjection(type)) { stats_.ejections_active_.inc(); host_sinks_[host]->eject(time_source_.currentSystemTime()); runCallbacks(host); @@ -208,12 +234,93 @@ void DetectorImpl::onConsecutive5xxWorker(HostSharedPtr host) { ejectHost(host, EjectionType::Consecutive5xx); } +// The canonical factor for outlier detection in normal distributions is 2. However, host +// success rates are intuitively a distribution with negative skew, with most of the mass around +// 100 and a left tail. Therefore, a more aggressive (lower) factor is needed to detect +// outliers. +const double Utility::SUCCESS_RATE_STDEV_FACTOR = 1.9; + +double Utility::successRateEjectionThreshold( + double success_rate_sum, const std::vector& valid_success_rate_hosts) { + // This function is using mean and standard deviation as statistical measures for outlier + // detection. First the mean is calculated by dividing the sum of success rate data over the + // number of data points. Then variance is calculated by taking the mean of the + // squared difference of data points to the mean of the data. Then standard deviation is + // calculated by taking the square root of the variance. Then the outlier threshold is + // calculated as the difference between the mean and the product of the standard + // deviation and a constant factor. + // + // For example with a data set that looks like success_rate_data = {50, 100, 100, 100, 100} the + // math would work as follows: + // success_rate_sum = 450 + // mean = 90 + // variance = 400 + // stdev = 20 + // threshold returned = 52 + double mean = success_rate_sum / valid_success_rate_hosts.size(); + double variance = 0; + std::for_each(valid_success_rate_hosts.begin(), valid_success_rate_hosts.end(), + [&variance, mean](HostSuccessRatePair v) { + variance += std::pow(v.success_rate_ - mean, 2); + }); + variance /= valid_success_rate_hosts.size(); + double stdev = std::sqrt(variance); + + return mean - (SUCCESS_RATE_STDEV_FACTOR * stdev); +} + +void DetectorImpl::processSuccessRateEjections() { + uint64_t success_rate_minimum_hosts = runtime_.snapshot().getInteger( + "outlier_detection.success_rate_minimum_hosts", config_.successRateMinimumHosts()); + uint64_t success_rate_request_volume = runtime_.snapshot().getInteger( + "outlier_detection.success_rate_request_volume", config_.successRateRequestVolume()); + std::vector valid_success_rate_hosts; + double success_rate_sum = 0; + + // Exit early if there are not enough hosts. + if (host_sinks_.size() < success_rate_minimum_hosts) { + return; + } + + // reserve upper bound of vector size to avoid reallocation. + valid_success_rate_hosts.reserve(host_sinks_.size()); + + for (const auto& host : host_sinks_) { + host.second->updateCurrentSuccessRateBucket(); + // Don't do work if the host is already ejected. + if (!host.first->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)) { + Optional host_success_rate = + host.second->successRateAccumulator().getSuccessRate(success_rate_request_volume); + + if (host_success_rate.valid()) { + valid_success_rate_hosts.emplace_back( + HostSuccessRatePair(host.first, host_success_rate.value())); + success_rate_sum += host_success_rate.value(); + } + } + } + + if (valid_success_rate_hosts.size() >= success_rate_minimum_hosts) { + double ejection_threshold = + Utility::successRateEjectionThreshold(success_rate_sum, valid_success_rate_hosts); + for (const auto& host_success_rate_pair : valid_success_rate_hosts) { + if (host_success_rate_pair.success_rate_ < ejection_threshold) { + stats_.ejections_success_rate_.inc(); + ejectHost(host_success_rate_pair.host_, EjectionType::SuccessRate); + } + } + } +} + void DetectorImpl::onIntervalTimer() { SystemTime now = time_source_.currentSystemTime(); + for (auto host : host_sinks_) { checkHostForUneject(host.first, host.second, now); } + processSuccessRateEjections(); + armIntervalTimer(); } @@ -268,9 +375,11 @@ std::string EventLoggerImpl::typeToString(EjectionType type) { switch (type) { case EjectionType::Consecutive5xx: return "5xx"; + case EjectionType::SuccessRate: + return "SuccessRate"; } - NOT_IMPLEMENTED; + NOT_REACHED; } int EventLoggerImpl::secsSinceLastAction(const Optional& lastActionTime, @@ -281,5 +390,24 @@ int EventLoggerImpl::secsSinceLastAction(const Optional& lastActionT return -1; } +SuccessRateAccumulatorBucket* SuccessRateAccumulator::updateCurrentWriter() { + // Right now current is being written to and backup is not. Flush the backup and swap. + backup_success_rate_bucket_->success_request_counter_ = 0; + backup_success_rate_bucket_->total_request_counter_ = 0; + + current_success_rate_bucket_.swap(backup_success_rate_bucket_); + + return current_success_rate_bucket_.get(); +} + +Optional SuccessRateAccumulator::getSuccessRate(uint64_t success_rate_request_volume) { + if (backup_success_rate_bucket_->total_request_counter_ < success_rate_request_volume) { + return Optional(); + } + + return Optional(backup_success_rate_bucket_->success_request_counter_ * 100.0 / + backup_success_rate_bucket_->total_request_counter_); +} + } // Outlier } // Upstream diff --git a/source/common/upstream/outlier_detection_impl.h b/source/common/upstream/outlier_detection_impl.h index 898217dc700a1..18744966a1a25 100644 --- a/source/common/upstream/outlier_detection_impl.h +++ b/source/common/upstream/outlier_detection_impl.h @@ -37,6 +37,53 @@ class DetectorImplFactory { EventLoggerSharedPtr event_logger); }; +/** + * Thin struct to facilitate calculations for success rate outlier detection. + */ +struct HostSuccessRatePair { + HostSuccessRatePair(HostSharedPtr host, double success_rate) + : host_(host), success_rate_(success_rate) {} + HostSharedPtr host_; + double success_rate_; +}; + +struct SuccessRateAccumulatorBucket { + std::atomic success_request_counter_; + std::atomic total_request_counter_; +}; + +/** + * The SuccessRateAccumulator uses the SuccessRateAccumulatorBucket to get per host success rate + * stats. This implementation has a fixed window size of time, and thus only needs a + * bucket to write to, and a bucket to accumulate/run stats over. + */ +class SuccessRateAccumulator { +public: + SuccessRateAccumulator() + : current_success_rate_bucket_(new SuccessRateAccumulatorBucket()), + backup_success_rate_bucket_(new SuccessRateAccumulatorBucket()) {} + + /** + * This function updates the bucket to write data to. + * @return a pointer to the SuccessRateAccumulatorBucket. + */ + SuccessRateAccumulatorBucket* updateCurrentWriter(); + /** + * This function returns the success rate of a host over a window of time if the request volume is + * high enough. The underlying window of time could be dynamically adjusted. In the current + * implementation it is a fixed time window. + * @param success_rate_request_volume the threshold of requests an accumulator has to have in + * order to be able to return a significant success rate value. + * @return a valid Optional with the success rate. If there were not enough requests, an + * invalid Optional is returned. + */ + Optional getSuccessRate(uint64_t success_rate_request_volume); + +private: + std::unique_ptr current_success_rate_bucket_; + std::unique_ptr backup_success_rate_bucket_; +}; + class DetectorImpl; /** @@ -45,10 +92,15 @@ class DetectorImpl; class DetectorHostSinkImpl : public DetectorHostSink { public: DetectorHostSinkImpl(std::shared_ptr detector, HostSharedPtr host) - : detector_(detector), host_(host) {} + : detector_(detector), host_(host) { + // Point the success_rate_accumulator_bucket_ pointer to a bucket. + updateCurrentSuccessRateBucket(); + } void eject(SystemTime ejection_time); void uneject(SystemTime ejection_time); + void updateCurrentSuccessRateBucket(); + SuccessRateAccumulator& successRateAccumulator() { return success_rate_accumulator_; }; // Upstream::Outlier::DetectorHostSink uint32_t numEjections() override { return num_ejections_; } @@ -64,6 +116,8 @@ class DetectorHostSinkImpl : public DetectorHostSink { Optional last_ejection_time_; Optional last_unejection_time_; uint32_t num_ejections_{}; + SuccessRateAccumulator success_rate_accumulator_; + std::atomic success_rate_accumulator_bucket_; }; /** @@ -74,7 +128,8 @@ class DetectorHostSinkImpl : public DetectorHostSink { COUNTER(ejections_total) \ GAUGE (ejections_active) \ COUNTER(ejections_overflow) \ - COUNTER(ejections_consecutive_5xx) + COUNTER(ejections_consecutive_5xx) \ + COUNTER(ejections_success_rate) // clang-format on /** @@ -95,14 +150,20 @@ class DetectorConfig { uint64_t baseEjectionTimeMs() { return base_ejection_time_ms_; } uint64_t consecutive5xx() { return consecutive_5xx_; } uint64_t maxEjectionPercent() { return max_ejection_percent_; } - uint64_t enforcing() { return enforcing_; } + uint64_t successRateMinimumHosts() { return success_rate_minimum_hosts_; } + uint64_t successRateRequestVolume() { return success_rate_request_volume_; } + uint64_t enforcingConsecutive5xx() { return enforcing_consecutive_5xx_; } + uint64_t enforcingSuccessRate() { return enforcing_success_rate_; } private: const uint64_t interval_ms_; const uint64_t base_ejection_time_ms_; const uint64_t consecutive_5xx_; const uint64_t max_ejection_percent_; - const uint64_t enforcing_; + const uint64_t success_rate_minimum_hosts_; + const uint64_t success_rate_request_volume_; + const uint64_t enforcing_consecutive_5xx_; + const uint64_t enforcing_success_rate_; }; /** @@ -139,6 +200,8 @@ class DetectorImpl : public Detector, public std::enable_shared_from_this& valid_success_rate_hosts); + +private: + // Factor to multiply the stdev of a cluster's success rate for success rate outlier ejection. + static const double SUCCESS_RATE_STDEV_FACTOR; +}; + } // Outlier } // Upstream diff --git a/test/common/upstream/outlier_detection_impl_test.cc b/test/common/upstream/outlier_detection_impl_test.cc index de6630f390866..e44c1f372bb8d 100644 --- a/test/common/upstream/outlier_detection_impl_test.cc +++ b/test/common/upstream/outlier_detection_impl_test.cc @@ -51,8 +51,22 @@ class CallbackChecker { class OutlierDetectorImplTest : public testing::Test { public: OutlierDetectorImplTest() { - ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing", 100)) + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_5xx", 100)) .WillByDefault(Return(true)); + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_success_rate", 100)) + .WillByDefault(Return(true)); + } + + void loadRq(std::vector& hosts, int num_rq, int http_code) { + for (uint64_t i = 0; i < hosts.size(); i++) { + loadRq(hosts[i], num_rq, http_code); + } + } + + void loadRq(HostSharedPtr host, int num_rq, int http_code) { + for (int i = 0; i < num_rq; i++) { + host->outlierDetector().putHttpResponseCode(http_code); + } } NiceMock cluster_; @@ -72,7 +86,8 @@ TEST_F(OutlierDetectorImplTest, DetectorStaticConfig) { "base_ejection_time_ms" : 10000, "consecutive_5xx" : 10, "max_ejection_percent" : 50, - "enforcing" : 10 + "enforcing_consecutive_5xx" : 10, + "enforcing_success_rate": 20 } )EOF"; @@ -85,7 +100,8 @@ TEST_F(OutlierDetectorImplTest, DetectorStaticConfig) { EXPECT_EQ(10000UL, detector->config().baseEjectionTimeMs()); EXPECT_EQ(10UL, detector->config().consecutive5xx()); EXPECT_EQ(50UL, detector->config().maxEjectionPercent()); - EXPECT_EQ(10UL, detector->config().enforcing()); + EXPECT_EQ(10UL, detector->config().enforcingConsecutive5xx()); + EXPECT_EQ(20UL, detector->config().enforcingSuccessRate()); } TEST_F(OutlierDetectorImplTest, DestroyWithActive) { @@ -97,10 +113,7 @@ TEST_F(OutlierDetectorImplTest, DestroyWithActive) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, 503); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -108,7 +121,7 @@ TEST_F(OutlierDetectorImplTest, DestroyWithActive) { EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[0]), EjectionType::Consecutive5xx)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, 503); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -129,14 +142,10 @@ TEST_F(OutlierDetectorImplTest, DestroyHostInUse) { detector.reset(); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 5, 503); } -TEST_F(OutlierDetectorImplTest, BasicFlow) { +TEST_F(OutlierDetectorImplTest, BasicFlow5xx) { EXPECT_CALL(cluster_, addMemberUpdateCb(_)); cluster_.hosts_ = {HostSharedPtr{new HostImpl( cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}}; @@ -150,13 +159,10 @@ TEST_F(OutlierDetectorImplTest, BasicFlow) { cluster_.runCallbacks({cluster_.hosts_[1]}, {}); // Cause a consecutive 5xx error. - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(200); + loadRq(cluster_.hosts_[0], 1, 503); + loadRq(cluster_.hosts_[0], 1, 200); cluster_.hosts_[0]->outlierDetector().putResponseTime(std::chrono::milliseconds(5)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, 503); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -164,7 +170,7 @@ TEST_F(OutlierDetectorImplTest, BasicFlow) { EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[0]), EjectionType::Consecutive5xx)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, 503); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -185,14 +191,98 @@ TEST_F(OutlierDetectorImplTest, BasicFlow) { interval_timer_->callback_(); EXPECT_FALSE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + // Eject host again to cause an ejection after an unejection has taken place + loadRq(cluster_.hosts_[0], 1, 503); + loadRq(cluster_.hosts_[0], 1, 200); + cluster_.hosts_[0]->outlierDetector().putResponseTime(std::chrono::milliseconds(5)); + loadRq(cluster_.hosts_[0], 4, 503); + + EXPECT_CALL(time_source_, currentSystemTime()) + .WillOnce(Return(SystemTime(std::chrono::milliseconds(40000)))); + EXPECT_CALL(checker_, check(cluster_.hosts_[0])); + EXPECT_CALL(*event_logger_, + logEject(std::static_pointer_cast(cluster_.hosts_[0]), + EjectionType::Consecutive5xx)); + loadRq(cluster_.hosts_[0], 1, 503); + EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); + cluster_.runCallbacks({}, cluster_.hosts_); EXPECT_EQ(0UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); - EXPECT_EQ(1UL, cluster_.info_->stats_store_.counter("outlier_detection.ejections_total").value()); - EXPECT_EQ(1UL, cluster_.info_->stats_store_.counter("outlier_detection.ejections_consecutive_5xx") + EXPECT_EQ(2UL, cluster_.info_->stats_store_.counter("outlier_detection.ejections_total").value()); + EXPECT_EQ(2UL, cluster_.info_->stats_store_.counter("outlier_detection.ejections_consecutive_5xx") .value()); } +TEST_F(OutlierDetectorImplTest, BasicFlowSuccessRate) { + EXPECT_CALL(cluster_, addMemberUpdateCb(_)); + cluster_.hosts_ = { + HostSharedPtr{new HostImpl(cluster_.info_, "", + Network::Utility::resolveUrl("tcp://127.0.0.1:80"), false, 1, "")}, + HostSharedPtr{new HostImpl(cluster_.info_, "", + Network::Utility::resolveUrl("tcp://127.0.0.1:81"), false, 1, "")}, + HostSharedPtr{new HostImpl(cluster_.info_, "", + Network::Utility::resolveUrl("tcp://127.0.0.1:82"), false, 1, "")}, + HostSharedPtr{new HostImpl(cluster_.info_, "", + Network::Utility::resolveUrl("tcp://127.0.0.1:83"), false, 1, "")}, + HostSharedPtr{new HostImpl( + cluster_.info_, "", Network::Utility::resolveUrl("tcp://127.0.0.1:84"), false, 1, "")}}; + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); + std::shared_ptr detector( + DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); + detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); + + // Turn off 5xx detection to test SR detection in isolation. + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_5xx", 100)) + .WillByDefault(Return(false)); + + // Cause a consecutive SR error on one host. First have 4 of the hosts have perfect SR. + loadRq(cluster_.hosts_, 200, 200); + loadRq(cluster_.hosts_[4], 200, 503); + + EXPECT_CALL(time_source_, currentSystemTime()) + .Times(2) + .WillRepeatedly(Return(SystemTime(std::chrono::milliseconds(10000)))); + EXPECT_CALL(checker_, check(cluster_.hosts_[4])); + EXPECT_CALL(*event_logger_, + logEject(std::static_pointer_cast(cluster_.hosts_[4]), + EjectionType::SuccessRate)); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); + interval_timer_->callback_(); + EXPECT_TRUE(cluster_.hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); + + // Interval that doesn't bring the host back in. + EXPECT_CALL(time_source_, currentSystemTime()) + .WillOnce(Return(SystemTime(std::chrono::milliseconds(19999)))); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); + interval_timer_->callback_(); + EXPECT_TRUE(cluster_.hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); + + // Interval that does bring the host back in. + EXPECT_CALL(time_source_, currentSystemTime()) + .WillOnce(Return(SystemTime(std::chrono::milliseconds(50001)))); + EXPECT_CALL(checker_, check(cluster_.hosts_[4])); + EXPECT_CALL(*event_logger_, + logUneject(std::static_pointer_cast(cluster_.hosts_[4]))); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); + interval_timer_->callback_(); + EXPECT_FALSE(cluster_.hosts_[4]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); + EXPECT_EQ(0UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); + + // Give 4 hosts enough request volume but not to the 5th. Should not cause an ejection. + loadRq(cluster_.hosts_, 25, 200); + loadRq(cluster_.hosts_[4], 25, 503); + + EXPECT_CALL(time_source_, currentSystemTime()) + .WillOnce(Return(SystemTime(std::chrono::milliseconds(60001)))); + EXPECT_CALL(*interval_timer_, enableTimer(std::chrono::milliseconds(10000))); + interval_timer_->callback_(); + EXPECT_EQ(0UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); +} + TEST_F(OutlierDetectorImplTest, RemoveWhileEjected) { EXPECT_CALL(cluster_, addMemberUpdateCb(_)); cluster_.hosts_ = {HostSharedPtr{new HostImpl( @@ -202,10 +292,7 @@ TEST_F(OutlierDetectorImplTest, RemoveWhileEjected) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, 503); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -213,7 +300,7 @@ TEST_F(OutlierDetectorImplTest, RemoveWhileEjected) { EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[0]), EjectionType::Consecutive5xx)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, 503); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -244,10 +331,7 @@ TEST_F(OutlierDetectorImplTest, Overflow) { ON_CALL(runtime_.snapshot_, getInteger("outlier_detection.max_ejection_percent", _)) .WillByDefault(Return(1)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, 503); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -258,11 +342,7 @@ TEST_F(OutlierDetectorImplTest, Overflow) { cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); - cluster_.hosts_[1]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[1]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[1]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[1]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[1]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[1], 5, 503); EXPECT_FALSE(cluster_.hosts_[1]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(1UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -279,14 +359,11 @@ TEST_F(OutlierDetectorImplTest, NotEnforcing) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, 503); - ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing", 100)) + ON_CALL(runtime_.snapshot_, featureEnabled("outlier_detection.enforcing_consecutive_5xx", 100)) .WillByDefault(Return(false)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, 503); EXPECT_FALSE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); EXPECT_EQ(0UL, cluster_.info_->stats_store_.gauge("outlier_detection.ejections_active").value()); @@ -304,14 +381,11 @@ TEST_F(OutlierDetectorImplTest, CrossThreadRemoveRace) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, 503); Event::PostCb post_cb; EXPECT_CALL(dispatcher_, post(_)).WillOnce(SaveArg<0>(&post_cb)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, 503); // Remove before the cross thread event comes in. std::vector old_hosts = std::move(cluster_.hosts_); @@ -330,14 +404,11 @@ TEST_F(OutlierDetectorImplTest, CrossThreadDestroyRace) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, 503); Event::PostCb post_cb; EXPECT_CALL(dispatcher_, post(_)).WillOnce(SaveArg<0>(&post_cb)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, 503); // Destroy before the cross thread event comes in. std::weak_ptr weak_detector = detector; @@ -357,14 +428,11 @@ TEST_F(OutlierDetectorImplTest, CrossThreadFailRace) { DetectorImpl::create(cluster_, *loader_, dispatcher_, runtime_, time_source_, event_logger_)); detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, 503); Event::PostCb post_cb; EXPECT_CALL(dispatcher_, post(_)).WillOnce(SaveArg<0>(&post_cb)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, 503); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -391,10 +459,7 @@ TEST_F(OutlierDetectorImplTest, Consecutive5xxAlreadyEjected) { detector->addChangedStateCb([&](HostSharedPtr host) -> void { checker_.check(host); }); // Cause a consecutive 5xx error. - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 4, 503); EXPECT_CALL(time_source_, currentSystemTime()) .WillOnce(Return(SystemTime(std::chrono::milliseconds(0)))); @@ -402,16 +467,20 @@ TEST_F(OutlierDetectorImplTest, Consecutive5xxAlreadyEjected) { EXPECT_CALL(*event_logger_, logEject(std::static_pointer_cast(cluster_.hosts_[0]), EjectionType::Consecutive5xx)); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, 503); EXPECT_TRUE(cluster_.hosts_[0]->healthFlagGet(Host::HealthFlag::FAILED_OUTLIER_CHECK)); // Cause another consecutive 5xx error. - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(200); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); - cluster_.hosts_[0]->outlierDetector().putHttpResponseCode(503); + loadRq(cluster_.hosts_[0], 1, 200); + loadRq(cluster_.hosts_[0], 5, 503); +} + +TEST(DetectorHostSinkNullImplTest, All) { + DetectorHostSinkNullImpl null_sink; + + EXPECT_EQ(0UL, null_sink.numEjections()); + EXPECT_FALSE(null_sink.lastEjectionTime().valid()); + EXPECT_FALSE(null_sink.lastUnejectionTime().valid()); } TEST(OutlierDetectionEventLoggerImplTest, All) { @@ -453,9 +522,9 @@ TEST(OutlierDetectionEventLoggerImplTest, All) { EXPECT_CALL(*file, write("{\"time\": \"1970-01-01T00:00:00.000Z\", \"secs_since_last_action\": " "\"30\", \"cluster\": " "\"fake_cluster\", \"upstream_url\": \"10.0.0.1:443\", \"action\": " - "\"eject\", \"type\": \"5xx\", \"num_ejections\": 0}\n")) + "\"eject\", \"type\": \"SuccessRate\", \"num_ejections\": 0}\n")) .WillOnce(SaveArg<0>(&log3)); - event_logger.logEject(host, EjectionType::Consecutive5xx); + event_logger.logEject(host, EjectionType::SuccessRate); Json::Factory::LoadFromString(log3); std::string log4; @@ -468,5 +537,16 @@ TEST(OutlierDetectionEventLoggerImplTest, All) { Json::Factory::LoadFromString(log4); } +TEST(OutlierUtility, SRThreshold) { + std::vector data = { + HostSuccessRatePair(nullptr, 50), HostSuccessRatePair(nullptr, 100), + HostSuccessRatePair(nullptr, 100), HostSuccessRatePair(nullptr, 100), + HostSuccessRatePair(nullptr, 100), + }; + double sum = 450; + + EXPECT_EQ(Utility::successRateEjectionThreshold(sum, data), 52); +} + } // Outlier } // Upstream