From c35f5ab707357310d1e64368a33e125671ee0cd0 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Thu, 22 Aug 2019 20:47:44 +0000 Subject: [PATCH 01/10] upstream: Add ability to disable host selection during panic Previously, when in a panic state, requests would be routed to all hosts. In some cases it is instead preferable to not route any requests. Add a configuration option for zone-aware load balancers which switches from routing to all hosts to no hosts. Closes #7550. Signed-off-by: James Forcier --- api/envoy/api/v2/cds.proto | 4 ++ .../load_balancing/panic_threshold.rst | 12 ++-- docs/root/intro/version_history.rst | 1 + source/common/upstream/load_balancer_impl.cc | 23 ++++++-- source/common/upstream/load_balancer_impl.h | 7 ++- .../upstream/load_balancer_impl_test.cc | 59 +++++++++++++++++++ 6 files changed, 96 insertions(+), 10 deletions(-) diff --git a/api/envoy/api/v2/cds.proto b/api/envoy/api/v2/cds.proto index e7df4a940bc6a..d1a3af0af9baf 100644 --- a/api/envoy/api/v2/cds.proto +++ b/api/envoy/api/v2/cds.proto @@ -539,6 +539,10 @@ message Cluster { // * :ref:`runtime values `. // * :ref:`Zone aware routing support `. google.protobuf.UInt64Value min_cluster_size = 2; + + // If set to true, Envoy will not consider any hosts when the cluster is in panic mode. + // Instead, the cluster will fail all requests as if all hosts are unhealthy. + bool disable_cluster_on_panic = 3; } // Configuration for :ref:`locality weighted load balancing // ` diff --git a/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst b/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst index e24022a8f07b5..3c71b3a788c44 100644 --- a/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst +++ b/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst @@ -5,8 +5,8 @@ Panic threshold During load balancing, Envoy will generally only consider available (healthy or degraded) hosts in an upstream cluster. 
However, if the percentage of available hosts in the cluster becomes too low, -Envoy will disregard health status and balance amongst all hosts. This is known as the *panic -threshold*. The default panic threshold is 50%. This is +Envoy will disregard health status and balance either amongst all hosts or no hosts. This is known +as the *panic threshold*. The default panic threshold is 50%. This is :ref:`configurable ` via runtime as well as in the :ref:`cluster configuration `. The panic threshold is used to avoid a situation in which host failures cascade throughout the @@ -20,8 +20,12 @@ disregards panic thresholds and continues to distribute traffic load across prio the algorithm described :ref:`here `. However, when normalized total availability drops below 100%, Envoy assumes that there are not enough available hosts across all priority levels. It continues to distribute traffic load across priorities, -but if a given priority level's availability is below the panic threshold, traffic will go to all hosts -in that priority level regardless of their availability. +but if a given priority level's availability is below the panic threshold, traffic will go to all +(or no) hosts in that priority level regardless of their availability. + +There are two modes Envoy can choose from when in a panic state: traffic will either be sent to all +hosts, or will be sent to no hosts (and therefore will always fail). This is configured in the +:ref:`cluster configuration `. The following examples explain the relationship between normalized total availability and panic threshold. It is assumed that the default value of 50% is used for the panic threshold. 
diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index 609554b4d3745..bd89d2fa6fba5 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -43,6 +43,7 @@ Version history * upstream: added network filter chains to upstream connections, see :ref:`filters`. * upstream: use p2c to select hosts for least-requests load balancers if all host weights are the same, even in cases where weights are not equal to 1. * upstream: added :ref:`an option ` that allows draining HTTP, TCP connection pools on cluster membership change. +* upstream: add :ref:`disable_cluster_on_panic ` to allow failing all requests to a cluster during panic state * zookeeper: parse responses and emit latency stats. 1.11.1 (August 13, 2019) diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc index 79bf87b79c445..5bc8781b3a068 100644 --- a/source/common/upstream/load_balancer_impl.cc +++ b/source/common/upstream/load_balancer_impl.cc @@ -282,7 +282,8 @@ ZoneAwareLoadBalancerBase::ZoneAwareLoadBalancerBase( routing_enabled_(PROTOBUF_PERCENT_TO_ROUNDED_INTEGER_OR_DEFAULT( common_config.zone_aware_lb_config(), routing_enabled, 100, 100)), min_cluster_size_(PROTOBUF_GET_WRAPPED_OR_DEFAULT(common_config.zone_aware_lb_config(), - min_cluster_size, 6U)) { + min_cluster_size, 6U)), + disable_cluster_on_panic_(common_config.zone_aware_lb_config().disable_cluster_on_panic()) { ASSERT(!priority_set.hostSetsPerPriority().empty()); resizePerPriorityState(); priority_set_.addPriorityUpdateCb( @@ -552,7 +553,11 @@ ZoneAwareLoadBalancerBase::hostSourceToUse(LoadBalancerContext* context) { // If the selected host set has insufficient healthy hosts, return all hosts. 
if (per_priority_panic_[hosts_source.priority_]) { stats_.lb_healthy_panic_.inc(); - hosts_source.source_type_ = HostsSource::SourceType::AllHosts; + if (disable_cluster_on_panic_) { + hosts_source.source_type_ = HostsSource::SourceType::NoHosts; + } else { + hosts_source.source_type_ = HostsSource::SourceType::AllHosts; + } return hosts_source; } @@ -586,9 +591,13 @@ ZoneAwareLoadBalancerBase::hostSourceToUse(LoadBalancerContext* context) { if (isGlobalPanic(localHostSet())) { stats_.lb_local_cluster_not_ok_.inc(); - // If the local Envoy instances are in global panic, do not do locality - // based routing. - hosts_source.source_type_ = sourceType(host_availability); + // If the local Envoy instances are in global panic, and we should not disable the cluster, do + // not do locality based routing. + if (disable_cluster_on_panic_) { + hosts_source.source_type_ = HostsSource::SourceType::NoHosts; + } else { + hosts_source.source_type_ = sourceType(host_availability); + } return hosts_source; } @@ -610,6 +619,8 @@ const HostVector& ZoneAwareLoadBalancerBase::hostSourceToHosts(HostsSource hosts return host_set.healthyHostsPerLocality().get()[hosts_source.locality_index_]; case HostsSource::SourceType::LocalityDegradedHosts: return host_set.degradedHostsPerLocality().get()[hosts_source.locality_index_]; + case HostsSource::SourceType::NoHosts: + return dummy_empty_host_vector; default: NOT_REACHED_GCOVR_EXCL_LINE; } @@ -696,6 +707,8 @@ void EdfLoadBalancerBase::refresh(uint32_t priority) { HostsSource(priority, HostsSource::SourceType::LocalityDegradedHosts, locality_index), host_set->degradedHostsPerLocality().get()[locality_index]); } + add_hosts_source(HostsSource(priority, HostsSource::SourceType::NoHosts), + dummy_empty_host_vector); } HostConstSharedPtr EdfLoadBalancerBase::chooseHostOnce(LoadBalancerContext* context) { diff --git a/source/common/upstream/load_balancer_impl.h b/source/common/upstream/load_balancer_impl.h index 98fd0c75d7d1e..6bc0fdd9074b3 100644 
--- a/source/common/upstream/load_balancer_impl.h +++ b/source/common/upstream/load_balancer_impl.h @@ -180,6 +180,8 @@ class ZoneAwareLoadBalancerBase : public LoadBalancerBase { LocalityHealthyHosts, // Degraded hosts for locality @ locality_index. LocalityDegradedHosts, + // No hosts in the host set. + NoHosts, }; HostsSource() = default; @@ -187,7 +189,7 @@ class ZoneAwareLoadBalancerBase : public LoadBalancerBase { HostsSource(uint32_t priority, SourceType source_type) : priority_(priority), source_type_(source_type) { ASSERT(source_type == SourceType::AllHosts || source_type == SourceType::HealthyHosts || - source_type == SourceType::DegradedHosts); + source_type == SourceType::DegradedHosts || source_type == SourceType::NoHosts); } HostsSource(uint32_t priority, SourceType source_type, uint32_t locality_index) @@ -231,6 +233,8 @@ class ZoneAwareLoadBalancerBase : public LoadBalancerBase { */ const HostVector& hostSourceToHosts(HostsSource hosts_source); + const HostVector dummy_empty_host_vector; + private: enum class LocalityRoutingState { // Locality based routing is off. @@ -300,6 +304,7 @@ class ZoneAwareLoadBalancerBase : public LoadBalancerBase { const uint32_t routing_enabled_; const uint64_t min_cluster_size_; + const bool disable_cluster_on_panic_; struct PerPriorityState { // The percent of requests which can be routed to the local locality. diff --git a/test/common/upstream/load_balancer_impl_test.cc b/test/common/upstream/load_balancer_impl_test.cc index e29dd9b0dcd86..b6be73e9c04f5 100644 --- a/test/common/upstream/load_balancer_impl_test.cc +++ b/test/common/upstream/load_balancer_impl_test.cc @@ -539,6 +539,39 @@ TEST_P(FailoverTest, PriorityUpdatesWithLocalHostSet) { EXPECT_EQ(tertiary_host_set_.hosts_[0], lb_->chooseHost(nullptr)); } +// Test that extending the priority set with an existing LB causes the correct updates when the +// cluster is configured to disable on panic. 
+TEST_P(FailoverTest, PriorityUpdatesWithLocalHostSetDisableOnPanic) { + host_set_.hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:80")}; + failover_host_set_.hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:81")}; + common_config_.mutable_zone_aware_lb_config()->set_disable_cluster_on_panic(true); + + init(false); + // With both the primary and failover hosts unhealthy, we should select no host. + EXPECT_EQ(nullptr, lb_->chooseHost(nullptr)); + + // Update the priority set with a new priority level P=2 and ensure the host + // is chosen + MockHostSet& tertiary_host_set_ = *priority_set_.getMockHostSet(2); + HostVectorSharedPtr hosts(new HostVector({makeTestHost(info_, "tcp://127.0.0.1:82")})); + tertiary_host_set_.hosts_ = *hosts; + tertiary_host_set_.healthy_hosts_ = tertiary_host_set_.hosts_; + HostVector add_hosts; + add_hosts.push_back(tertiary_host_set_.hosts_[0]); + tertiary_host_set_.runCallbacks(add_hosts, {}); + EXPECT_EQ(tertiary_host_set_.hosts_[0], lb_->chooseHost(nullptr)); + + // Now add a healthy host in P=0 and make sure it is immediately selected. + host_set_.healthy_hosts_ = host_set_.hosts_; + host_set_.runCallbacks(add_hosts, {}); + EXPECT_EQ(host_set_.hosts_[0], lb_->chooseHost(nullptr)); + + // Remove the healthy host and ensure we fail back over to tertiary_host_set_ + host_set_.healthy_hosts_ = {}; + host_set_.runCallbacks({}, {}); + EXPECT_EQ(tertiary_host_set_.hosts_[0], lb_->chooseHost(nullptr)); +} + // Test extending the priority set. TEST_P(FailoverTest, ExtendPrioritiesUpdatingPrioritySet) { host_set_.hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:80")}; @@ -829,6 +862,32 @@ TEST_P(RoundRobinLoadBalancerTest, MaxUnhealthyPanic) { EXPECT_EQ(3UL, stats_.lb_healthy_panic_.value()); } +// Test that no hosts are selected when disable_cluster_on_panic is enabled. 
+TEST_P(RoundRobinLoadBalancerTest, MaxUnhealthyPanicDisableOnPanic) { + hostSet().healthy_hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:80"), + makeTestHost(info_, "tcp://127.0.0.1:81")}; + hostSet().hosts_ = { + makeTestHost(info_, "tcp://127.0.0.1:80"), makeTestHost(info_, "tcp://127.0.0.1:81"), + makeTestHost(info_, "tcp://127.0.0.1:82"), makeTestHost(info_, "tcp://127.0.0.1:83"), + makeTestHost(info_, "tcp://127.0.0.1:84"), makeTestHost(info_, "tcp://127.0.0.1:85")}; + + common_config_.mutable_zone_aware_lb_config()->set_disable_cluster_on_panic(true); + + init(false); + EXPECT_EQ(nullptr, lb_->chooseHost(nullptr)); + + // Take the threshold back above the panic threshold. + hostSet().healthy_hosts_ = { + makeTestHost(info_, "tcp://127.0.0.1:80"), makeTestHost(info_, "tcp://127.0.0.1:81"), + makeTestHost(info_, "tcp://127.0.0.1:82"), makeTestHost(info_, "tcp://127.0.0.1:83")}; + hostSet().runCallbacks({}, {}); + + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); + + EXPECT_EQ(1UL, stats_.lb_healthy_panic_.value()); +} + // Ensure if the panic threshold is 0%, panic mode is disabled. 
TEST_P(RoundRobinLoadBalancerTest, DisablePanicMode) { hostSet().healthy_hosts_ = {}; From 867d7c560a0e5a9ef8e867f501200443cfb62c31 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Fri, 23 Aug 2019 17:23:43 +0000 Subject: [PATCH 02/10] Use optional return type for hostSourceToUse; drop dummy_empty_host_vector Signed-off-by: James Forcier --- source/common/upstream/load_balancer_impl.cc | 32 +++++++++++--------- source/common/upstream/load_balancer_impl.h | 8 ++--- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc index 5bc8781b3a068..4e3e284e483a6 100644 --- a/source/common/upstream/load_balancer_impl.cc +++ b/source/common/upstream/load_balancer_impl.cc @@ -540,7 +540,7 @@ uint32_t ZoneAwareLoadBalancerBase::tryChooseLocalLocalityHosts(const HostSet& h return i; } -ZoneAwareLoadBalancerBase::HostsSource +absl::optional ZoneAwareLoadBalancerBase::hostSourceToUse(LoadBalancerContext* context) { auto host_set_and_source = chooseHostSet(context); @@ -554,11 +554,11 @@ ZoneAwareLoadBalancerBase::hostSourceToUse(LoadBalancerContext* context) { if (per_priority_panic_[hosts_source.priority_]) { stats_.lb_healthy_panic_.inc(); if (disable_cluster_on_panic_) { - hosts_source.source_type_ = HostsSource::SourceType::NoHosts; + return absl::nullopt; } else { hosts_source.source_type_ = HostsSource::SourceType::AllHosts; + return hosts_source; } - return hosts_source; } // If we're doing locality weighted balancing, pick locality. @@ -594,11 +594,11 @@ ZoneAwareLoadBalancerBase::hostSourceToUse(LoadBalancerContext* context) { // If the local Envoy instances are in global panic, and we should not disable the cluster, do // not do locality based routing. 
if (disable_cluster_on_panic_) { - hosts_source.source_type_ = HostsSource::SourceType::NoHosts; + return absl::nullopt; } else { hosts_source.source_type_ = sourceType(host_availability); + return hosts_source; } - return hosts_source; } hosts_source.source_type_ = localitySourceType(host_availability); @@ -619,8 +619,6 @@ const HostVector& ZoneAwareLoadBalancerBase::hostSourceToHosts(HostsSource hosts return host_set.healthyHostsPerLocality().get()[hosts_source.locality_index_]; case HostsSource::SourceType::LocalityDegradedHosts: return host_set.degradedHostsPerLocality().get()[hosts_source.locality_index_]; - case HostsSource::SourceType::NoHosts: - return dummy_empty_host_vector; default: NOT_REACHED_GCOVR_EXCL_LINE; } @@ -707,13 +705,14 @@ void EdfLoadBalancerBase::refresh(uint32_t priority) { HostsSource(priority, HostsSource::SourceType::LocalityDegradedHosts, locality_index), host_set->degradedHostsPerLocality().get()[locality_index]); } - add_hosts_source(HostsSource(priority, HostsSource::SourceType::NoHosts), - dummy_empty_host_vector); } HostConstSharedPtr EdfLoadBalancerBase::chooseHostOnce(LoadBalancerContext* context) { - const HostsSource hosts_source = hostSourceToUse(context); - auto scheduler_it = scheduler_.find(hosts_source); + const absl::optional hosts_source = hostSourceToUse(context); + if (!hosts_source) { + return nullptr; + } + auto scheduler_it = scheduler_.find(*hosts_source); // We should always have a scheduler for any return value from // hostSourceToUse() via the construction in refresh(); ASSERT(scheduler_it != scheduler_.end()); @@ -730,11 +729,11 @@ HostConstSharedPtr EdfLoadBalancerBase::chooseHostOnce(LoadBalancerContext* cont } return host; } else { - const HostVector& hosts_to_use = hostSourceToHosts(hosts_source); + const HostVector& hosts_to_use = hostSourceToHosts(*hosts_source); if (hosts_to_use.empty()) { return nullptr; } - return unweightedHostPick(hosts_to_use, hosts_source); + return 
unweightedHostPick(hosts_to_use, *hosts_source); } } @@ -762,7 +761,12 @@ HostConstSharedPtr LeastRequestLoadBalancer::unweightedHostPick(const HostVector } HostConstSharedPtr RandomLoadBalancer::chooseHostOnce(LoadBalancerContext* context) { - const HostVector& hosts_to_use = hostSourceToHosts(hostSourceToUse(context)); + const absl::optional hosts_source = hostSourceToUse(context); + if (!hosts_source) { + return nullptr; + } + + const HostVector& hosts_to_use = hostSourceToHosts(*hosts_source); if (hosts_to_use.empty()) { return nullptr; } diff --git a/source/common/upstream/load_balancer_impl.h b/source/common/upstream/load_balancer_impl.h index 6bc0fdd9074b3..fc49931136c81 100644 --- a/source/common/upstream/load_balancer_impl.h +++ b/source/common/upstream/load_balancer_impl.h @@ -180,8 +180,6 @@ class ZoneAwareLoadBalancerBase : public LoadBalancerBase { LocalityHealthyHosts, // Degraded hosts for locality @ locality_index. LocalityDegradedHosts, - // No hosts in the host set. - NoHosts, }; HostsSource() = default; @@ -189,7 +187,7 @@ class ZoneAwareLoadBalancerBase : public LoadBalancerBase { HostsSource(uint32_t priority, SourceType source_type) : priority_(priority), source_type_(source_type) { ASSERT(source_type == SourceType::AllHosts || source_type == SourceType::HealthyHosts || - source_type == SourceType::DegradedHosts || source_type == SourceType::NoHosts); + source_type == SourceType::DegradedHosts); } HostsSource(uint32_t priority, SourceType source_type, uint32_t locality_index) @@ -226,15 +224,13 @@ class ZoneAwareLoadBalancerBase : public LoadBalancerBase { /** * Pick the host source to use, doing zone aware routing when the hosts are sufficiently healthy. */ - HostsSource hostSourceToUse(LoadBalancerContext* context); + absl::optional hostSourceToUse(LoadBalancerContext* context); /** * Index into priority_set via hosts source descriptor. 
*/ const HostVector& hostSourceToHosts(HostsSource hosts_source); - const HostVector dummy_empty_host_vector; - private: enum class LocalityRoutingState { // Locality based routing is off. From d1f513f010db4dc33876aa779eaffa1a3e14490f Mon Sep 17 00:00:00 2001 From: James Forcier Date: Mon, 26 Aug 2019 18:26:52 +0000 Subject: [PATCH 03/10] Address review nits Signed-off-by: James Forcier --- api/envoy/api/v2/cds.proto | 2 +- .../upstream/load_balancing/panic_threshold.rst | 8 ++++---- docs/root/intro/version_history.rst | 2 +- source/common/upstream/load_balancer_impl.cc | 8 ++++---- source/common/upstream/load_balancer_impl.h | 2 +- test/common/upstream/load_balancer_impl_test.cc | 6 +++--- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/api/envoy/api/v2/cds.proto b/api/envoy/api/v2/cds.proto index d1a3af0af9baf..bfdd3c9bc69a9 100644 --- a/api/envoy/api/v2/cds.proto +++ b/api/envoy/api/v2/cds.proto @@ -542,7 +542,7 @@ message Cluster { // If set to true, Envoy will not consider any hosts when the cluster is in panic mode. // Instead, the cluster will fail all requests as if all hosts are unhealthy. - bool disable_cluster_on_panic = 3; + bool fail_traffic_on_panic = 3; } // Configuration for :ref:`locality weighted load balancing // ` diff --git a/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst b/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst index 3c71b3a788c44..e3222e7307c81 100644 --- a/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst +++ b/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst @@ -12,6 +12,10 @@ as the *panic threshold*. The default panic threshold is 50%. This is The panic threshold is used to avoid a situation in which host failures cascade throughout the cluster as load increases. 
+There are two modes Envoy can choose from when in a panic state: traffic will either be sent to all +hosts, or will be sent to no hosts (and therefore will always fail). This is configured in the +:ref:`cluster configuration `. + Panic thresholds work in conjunction with priorities. If the number of available hosts in a given priority goes down, Envoy will try to shift some traffic to lower priorities. If it succeeds in finding enough available hosts in lower priorities, Envoy will disregard panic thresholds. In @@ -23,10 +27,6 @@ available hosts across all priority levels. It continues to distribute traffic l but if a given priority level's availability is below the panic threshold, traffic will go to all (or no) hosts in that priority level regardless of their availability. -There are two modes Envoy can choose from when in a panic state: traffic will either be sent to all -hosts, or will be sent to no hosts (and therefore will always fail). This is configured in the -:ref:`cluster configuration `. - The following examples explain the relationship between normalized total availability and panic threshold. It is assumed that the default value of 50% is used for the panic threshold. diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index bd89d2fa6fba5..f54cb971ee7e9 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -43,7 +43,7 @@ Version history * upstream: added network filter chains to upstream connections, see :ref:`filters`. * upstream: use p2c to select hosts for least-requests load balancers if all host weights are the same, even in cases where weights are not equal to 1. * upstream: added :ref:`an option ` that allows draining HTTP, TCP connection pools on cluster membership change. 
-* upstream: add :ref:`disable_cluster_on_panic ` to allow failing all requests to a cluster during panic state +* upstream: add :ref:`fail_traffic_on_panic ` to allow failing all requests to a cluster during panic state. * zookeeper: parse responses and emit latency stats. 1.11.1 (August 13, 2019) diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc index 4e3e284e483a6..b2b61d32f3767 100644 --- a/source/common/upstream/load_balancer_impl.cc +++ b/source/common/upstream/load_balancer_impl.cc @@ -283,7 +283,7 @@ ZoneAwareLoadBalancerBase::ZoneAwareLoadBalancerBase( common_config.zone_aware_lb_config(), routing_enabled, 100, 100)), min_cluster_size_(PROTOBUF_GET_WRAPPED_OR_DEFAULT(common_config.zone_aware_lb_config(), min_cluster_size, 6U)), - disable_cluster_on_panic_(common_config.zone_aware_lb_config().disable_cluster_on_panic()) { + fail_traffic_on_panic_(common_config.zone_aware_lb_config().fail_traffic_on_panic()) { ASSERT(!priority_set.hostSetsPerPriority().empty()); resizePerPriorityState(); priority_set_.addPriorityUpdateCb( @@ -553,7 +553,7 @@ ZoneAwareLoadBalancerBase::hostSourceToUse(LoadBalancerContext* context) { // If the selected host set has insufficient healthy hosts, return all hosts. if (per_priority_panic_[hosts_source.priority_]) { stats_.lb_healthy_panic_.inc(); - if (disable_cluster_on_panic_) { + if (fail_traffic_on_panic_) { return absl::nullopt; } else { hosts_source.source_type_ = HostsSource::SourceType::AllHosts; @@ -591,9 +591,9 @@ ZoneAwareLoadBalancerBase::hostSourceToUse(LoadBalancerContext* context) { if (isGlobalPanic(localHostSet())) { stats_.lb_local_cluster_not_ok_.inc(); - // If the local Envoy instances are in global panic, and we should not disable the cluster, do + // If the local Envoy instances are in global panic, and we should not fail traffic, do // not do locality based routing. 
- if (disable_cluster_on_panic_) { + if (fail_traffic_on_panic_) { return absl::nullopt; } else { hosts_source.source_type_ = sourceType(host_availability); diff --git a/source/common/upstream/load_balancer_impl.h b/source/common/upstream/load_balancer_impl.h index fc49931136c81..41e141ad3111d 100644 --- a/source/common/upstream/load_balancer_impl.h +++ b/source/common/upstream/load_balancer_impl.h @@ -300,7 +300,7 @@ class ZoneAwareLoadBalancerBase : public LoadBalancerBase { const uint32_t routing_enabled_; const uint64_t min_cluster_size_; - const bool disable_cluster_on_panic_; + const bool fail_traffic_on_panic_; struct PerPriorityState { // The percent of requests which can be routed to the local locality. diff --git a/test/common/upstream/load_balancer_impl_test.cc b/test/common/upstream/load_balancer_impl_test.cc index b6be73e9c04f5..efaf49b2f1087 100644 --- a/test/common/upstream/load_balancer_impl_test.cc +++ b/test/common/upstream/load_balancer_impl_test.cc @@ -544,7 +544,7 @@ TEST_P(FailoverTest, PriorityUpdatesWithLocalHostSet) { TEST_P(FailoverTest, PriorityUpdatesWithLocalHostSetDisableOnPanic) { host_set_.hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:80")}; failover_host_set_.hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:81")}; - common_config_.mutable_zone_aware_lb_config()->set_disable_cluster_on_panic(true); + common_config_.mutable_zone_aware_lb_config()->set_fail_traffic_on_panic(true); init(false); // With both the primary and failover hosts unhealthy, we should select no host. @@ -862,7 +862,7 @@ TEST_P(RoundRobinLoadBalancerTest, MaxUnhealthyPanic) { EXPECT_EQ(3UL, stats_.lb_healthy_panic_.value()); } -// Test that no hosts are selected when disable_cluster_on_panic is enabled. +// Test that no hosts are selected when fail_cluster_on_panic is enabled. 
TEST_P(RoundRobinLoadBalancerTest, MaxUnhealthyPanicDisableOnPanic) { hostSet().healthy_hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:80"), makeTestHost(info_, "tcp://127.0.0.1:81")}; @@ -871,7 +871,7 @@ TEST_P(RoundRobinLoadBalancerTest, MaxUnhealthyPanicDisableOnPanic) { makeTestHost(info_, "tcp://127.0.0.1:82"), makeTestHost(info_, "tcp://127.0.0.1:83"), makeTestHost(info_, "tcp://127.0.0.1:84"), makeTestHost(info_, "tcp://127.0.0.1:85")}; - common_config_.mutable_zone_aware_lb_config()->set_disable_cluster_on_panic(true); + common_config_.mutable_zone_aware_lb_config()->set_fail_cluster_on_panic(true); init(false); EXPECT_EQ(nullptr, lb_->chooseHost(nullptr)); From fbce2af6c5d4c4d5e417cb7447be55494fa74f5d Mon Sep 17 00:00:00 2001 From: James Forcier Date: Mon, 26 Aug 2019 18:45:38 +0000 Subject: [PATCH 04/10] Fix incorrect renamed field Signed-off-by: James Forcier --- test/common/upstream/load_balancer_impl_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/common/upstream/load_balancer_impl_test.cc b/test/common/upstream/load_balancer_impl_test.cc index efaf49b2f1087..49e57e913feca 100644 --- a/test/common/upstream/load_balancer_impl_test.cc +++ b/test/common/upstream/load_balancer_impl_test.cc @@ -862,7 +862,7 @@ TEST_P(RoundRobinLoadBalancerTest, MaxUnhealthyPanic) { EXPECT_EQ(3UL, stats_.lb_healthy_panic_.value()); } -// Test that no hosts are selected when fail_cluster_on_panic is enabled. +// Test that no hosts are selected when fail_traffic_on_panic is enabled. 
TEST_P(RoundRobinLoadBalancerTest, MaxUnhealthyPanicDisableOnPanic) { hostSet().healthy_hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:80"), makeTestHost(info_, "tcp://127.0.0.1:81")}; @@ -871,7 +871,7 @@ TEST_P(RoundRobinLoadBalancerTest, MaxUnhealthyPanicDisableOnPanic) { makeTestHost(info_, "tcp://127.0.0.1:82"), makeTestHost(info_, "tcp://127.0.0.1:83"), makeTestHost(info_, "tcp://127.0.0.1:84"), makeTestHost(info_, "tcp://127.0.0.1:85")}; - common_config_.mutable_zone_aware_lb_config()->set_fail_cluster_on_panic(true); + common_config_.mutable_zone_aware_lb_config()->set_fail_traffic_on_panic(true); init(false); EXPECT_EQ(nullptr, lb_->chooseHost(nullptr)); From 5a9321510015df4daa9db835745510b36179da5a Mon Sep 17 00:00:00 2001 From: James Forcier Date: Tue, 27 Aug 2019 13:53:03 +0000 Subject: [PATCH 05/10] Add note about motivation to docs Signed-off-by: James Forcier --- api/envoy/api/v2/cds.proto | 3 ++- .../arch_overview/upstream/load_balancing/panic_threshold.rst | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/api/envoy/api/v2/cds.proto b/api/envoy/api/v2/cds.proto index bfdd3c9bc69a9..18bc9b7c7ced9 100644 --- a/api/envoy/api/v2/cds.proto +++ b/api/envoy/api/v2/cds.proto @@ -541,7 +541,8 @@ message Cluster { google.protobuf.UInt64Value min_cluster_size = 2; // If set to true, Envoy will not consider any hosts when the cluster is in panic mode. - // Instead, the cluster will fail all requests as if all hosts are unhealthy. + // Instead, the cluster will fail all requests as if all hosts are unhealthy. This can help + // avoid potentially overwhelming a failing service. 
bool fail_traffic_on_panic = 3; } // Configuration for :ref:`locality weighted load balancing diff --git a/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst b/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst index e3222e7307c81..92b1b717f99ee 100644 --- a/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst +++ b/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst @@ -15,6 +15,8 @@ cluster as load increases. There are two modes Envoy can choose from when in a panic state: traffic will either be sent to all hosts, or will be sent to no hosts (and therefore will always fail). This is configured in the :ref:`cluster configuration `. +Choosing to fail traffic during panic scenarios can help avoid overwhelming potentially failing +upstream services. Panic thresholds work in conjunction with priorities. If the number of available hosts in a given priority goes down, Envoy will try to shift some traffic to lower priorities. If it succeeds in From 668f72919e1b55d37da854321ef84ade9f91b5fb Mon Sep 17 00:00:00 2001 From: James Forcier Date: Wed, 28 Aug 2019 20:19:51 +0000 Subject: [PATCH 06/10] Address wording nit in version history Signed-off-by: James Forcier --- docs/root/intro/version_history.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index f54cb971ee7e9..1f195591e981a 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -43,7 +43,7 @@ Version history * upstream: added network filter chains to upstream connections, see :ref:`filters`. * upstream: use p2c to select hosts for least-requests load balancers if all host weights are the same, even in cases where weights are not equal to 1. * upstream: added :ref:`an option ` that allows draining HTTP, TCP connection pools on cluster membership change. 
-* upstream: add :ref:`fail_traffic_on_panic ` to allow failing all requests to a cluster during panic state. +* upstream: added :ref:`fail_traffic_on_panic ` to allow failing all requests to a cluster during panic state. * zookeeper: parse responses and emit latency stats. 1.11.1 (August 13, 2019) From c45f03d835880de977116ffddae4d93b12ae6b10 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Wed, 4 Sep 2019 16:36:31 +0000 Subject: [PATCH 07/10] Add additional explanation on useful scenarios for this option Signed-off-by: James Forcier --- .../upstream/load_balancing/panic_threshold.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst b/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst index 92b1b717f99ee..864ad11171e6f 100644 --- a/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst +++ b/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst @@ -16,7 +16,12 @@ There are two modes Envoy can choose from when in a panic state: traffic will ei hosts, or will be sent to no hosts (and therefore will always fail). This is configured in the :ref:`cluster configuration `. Choosing to fail traffic during panic scenarios can help avoid overwhelming potentially failing -upstream services. +upstream services, as it will reduce the load on the upstream service before all hosts have been +determined to be unhealthy. However, it eliminates the possibility of *some* requests succeeding +even when many or all hosts in a cluster are unhealthy. This may be a good tradeoff to make if a +given service is observed to fail in an all-or-nothing pattern, as it will more quickly cut off +requests to the cluster. Conversely, if a cluster typically continues to successfully service *some* +requests even when degraded, enabling this option is probably unhelpful. Panic thresholds work in conjunction with priorities. 
If the number of available hosts in a given priority goes down, Envoy will try to shift some traffic to lower priorities. If it succeeds in From 095be1b6ae85772195ff8715d2560c5598ea4748 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Thu, 5 Sep 2019 18:56:51 +0000 Subject: [PATCH 08/10] Address bad merge/comment nits Signed-off-by: James Forcier --- docs/root/intro/version_history.rst | 1 - source/common/upstream/load_balancer_impl.cc | 3 ++- source/common/upstream/load_balancer_impl.h | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index 87fb2d9da4790..ee1988bd205e1 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -56,7 +56,6 @@ Version history * upstream: added :ref:`an option ` that allows draining HTTP, TCP connection pools on cluster membership change. * upstream: added network filter chains to upstream connections, see :ref:`filters`. * upstream: use p2c to select hosts for least-requests load balancers if all host weights are the same, even in cases where weights are not equal to 1. -* upstream: added :ref:`an option ` that allows draining HTTP, TCP connection pools on cluster membership change. * upstream: added :ref:`fail_traffic_on_panic ` to allow failing all requests to a cluster during panic state. * zookeeper: parse responses and emit latency stats. diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc index b2b61d32f3767..b3172d9c9a271 100644 --- a/source/common/upstream/load_balancer_impl.cc +++ b/source/common/upstream/load_balancer_impl.cc @@ -550,7 +550,8 @@ ZoneAwareLoadBalancerBase::hostSourceToUse(LoadBalancerContext* context) { HostsSource hosts_source; hosts_source.priority_ = host_set.priority(); - // If the selected host set has insufficient healthy hosts, return all hosts. 
+ // If the selected host set has insufficient healthy hosts, return all hosts (unless we should
+ // fail traffic on panic, in which case return no host). if (per_priority_panic_[hosts_source.priority_]) { stats_.lb_healthy_panic_.inc(); if (fail_traffic_on_panic_) { diff --git a/source/common/upstream/load_balancer_impl.h b/source/common/upstream/load_balancer_impl.h index 41e141ad3111d..988955f359ab0 100644 --- a/source/common/upstream/load_balancer_impl.h +++ b/source/common/upstream/load_balancer_impl.h @@ -223,6 +223,7 @@ class ZoneAwareLoadBalancerBase : public LoadBalancerBase { /** * Pick the host source to use, doing zone aware routing when the hosts are sufficiently healthy. + * If no host is chosen (due to fail_traffic_on_panic being set), return absl::nullopt. */ absl::optional<HostsSource> hostSourceToUse(LoadBalancerContext* context); From b7a649c48f82adc3933be13d36a04abfc5e40ef6 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Thu, 5 Sep 2019 20:47:46 +0000 Subject: [PATCH 09/10] Add new unit tests to cover missed paths Signed-off-by: James Forcier --- .../upstream/load_balancer_impl_test.cc | 63 +++++++++++++++++-- 1 file changed, 59 insertions(+), 4 deletions(-) diff --git a/test/common/upstream/load_balancer_impl_test.cc b/test/common/upstream/load_balancer_impl_test.cc index 49e57e913feca..a2e7366f5ece4 100644 --- a/test/common/upstream/load_balancer_impl_test.cc +++ b/test/common/upstream/load_balancer_impl_test.cc @@ -1236,6 +1236,41 @@ TEST_P(RoundRobinLoadBalancerTest, NoZoneAwareRoutingLocalEmpty) { EXPECT_EQ(1U, stats_.lb_local_cluster_not_ok_.value()); } +TEST_P(RoundRobinLoadBalancerTest, NoZoneAwareRoutingLocalEmptyFailTrafficOnPanic) { + common_config_.mutable_zone_aware_lb_config()->set_fail_traffic_on_panic(true); + + if (&hostSet() == &failover_host_set_) { // P = 1 does not support zone-aware routing. 
+ return; + } + HostVectorSharedPtr upstream_hosts(new HostVector( + {makeTestHost(info_, "tcp://127.0.0.1:80"), makeTestHost(info_, "tcp://127.0.0.1:81")})); + HostVectorSharedPtr local_hosts(new HostVector({}, {})); + + HostsPerLocalitySharedPtr upstream_hosts_per_locality = makeHostsPerLocality( + {{makeTestHost(info_, "tcp://127.0.0.1:80")}, {makeTestHost(info_, "tcp://127.0.0.1:81")}}); + HostsPerLocalitySharedPtr local_hosts_per_locality = makeHostsPerLocality({{}, {}}); + + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.healthy_panic_threshold", 50)) + .WillOnce(Return(50)) + .WillOnce(Return(50)); + EXPECT_CALL(runtime_.snapshot_, featureEnabled("upstream.zone_routing.enabled", 100)) + .WillOnce(Return(true)); + EXPECT_CALL(runtime_.snapshot_, getInteger("upstream.zone_routing.min_cluster_size", 6)) + .WillOnce(Return(1)); + + hostSet().healthy_hosts_ = *upstream_hosts; + hostSet().hosts_ = *upstream_hosts; + hostSet().healthy_hosts_per_locality_ = upstream_hosts_per_locality; + init(true); + updateHosts(local_hosts, local_hosts_per_locality); + + // Local cluster is not OK, we'll do regular routing (and select no host, since we're in global + // panic). + EXPECT_EQ(nullptr, lb_->chooseHost(nullptr)); + EXPECT_EQ(0U, stats_.lb_healthy_panic_.value()); + EXPECT_EQ(1U, stats_.lb_local_cluster_not_ok_.value()); +} + // Validate that if we have healthy host lists >= 2, but there is no local // locality included, that we skip zone aware routing and fallback. 
TEST_P(RoundRobinLoadBalancerTest, NoZoneAwareRoutingNoLocalLocality) { @@ -1447,20 +1482,40 @@ INSTANTIATE_TEST_SUITE_P(PrimaryOrFailover, LeastRequestLoadBalancerTest, class RandomLoadBalancerTest : public LoadBalancerTestBase { public: - RandomLoadBalancer lb_{priority_set_, nullptr, stats_, runtime_, random_, common_config_}; + void init() { + lb_.reset( + new RandomLoadBalancer(priority_set_, nullptr, stats_, runtime_, random_, common_config_)); + } + std::shared_ptr lb_; }; -TEST_P(RandomLoadBalancerTest, NoHosts) { EXPECT_EQ(nullptr, lb_.chooseHost(nullptr)); } +TEST_P(RandomLoadBalancerTest, NoHosts) { + init(); + EXPECT_EQ(nullptr, lb_->chooseHost(nullptr)); +} TEST_P(RandomLoadBalancerTest, Normal) { + init(); + hostSet().healthy_hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:80"), makeTestHost(info_, "tcp://127.0.0.1:81")}; hostSet().hosts_ = hostSet().healthy_hosts_; hostSet().runCallbacks({}, {}); // Trigger callbacks. The added/removed lists are not relevant. EXPECT_CALL(random_, random()).WillOnce(Return(0)).WillOnce(Return(2)); - EXPECT_EQ(hostSet().healthy_hosts_[0], lb_.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr)); EXPECT_CALL(random_, random()).WillOnce(Return(0)).WillOnce(Return(3)); - EXPECT_EQ(hostSet().healthy_hosts_[1], lb_.chooseHost(nullptr)); + EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr)); +} + +TEST_P(RandomLoadBalancerTest, FailClusterOnPanic) { + common_config_.mutable_zone_aware_lb_config()->set_fail_traffic_on_panic(true); + init(); + + hostSet().healthy_hosts_ = {}; + hostSet().hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:80"), + makeTestHost(info_, "tcp://127.0.0.1:81")}; + hostSet().runCallbacks({}, {}); // Trigger callbacks. The added/removed lists are not relevant. 
+ EXPECT_EQ(nullptr, lb_->chooseHost(nullptr)); } INSTANTIATE_TEST_SUITE_P(PrimaryOrFailover, RandomLoadBalancerTest, ::testing::Values(true, false)); From bc5410348153e40b9defee28fa6a6f3df4892879 Mon Sep 17 00:00:00 2001 From: James Forcier Date: Tue, 10 Sep 2019 18:28:50 +0000 Subject: [PATCH 10/10] Add reflink to panic mode docs from API doc comment Signed-off-by: James Forcier --- api/envoy/api/v2/cds.proto | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/api/envoy/api/v2/cds.proto b/api/envoy/api/v2/cds.proto index 69241e87cf74d..e444393b19119 100644 --- a/api/envoy/api/v2/cds.proto +++ b/api/envoy/api/v2/cds.proto @@ -540,9 +540,10 @@ message Cluster { // * :ref:`Zone aware routing support `. google.protobuf.UInt64Value min_cluster_size = 2; - // If set to true, Envoy will not consider any hosts when the cluster is in panic mode. - // Instead, the cluster will fail all requests as if all hosts are unhealthy. This can help - // avoid potentially overwhelming a failing service. + // If set to true, Envoy will not consider any hosts when the cluster is in :ref:`panic + // mode`. Instead, the cluster will fail all + // requests as if all hosts are unhealthy. This can help avoid potentially overwhelming a + // failing service. bool fail_traffic_on_panic = 3; } // Configuration for :ref:`locality weighted load balancing