diff --git a/api/envoy/service/health/v3/hds.proto b/api/envoy/service/health/v3/hds.proto index 41ed2040b1a73..a0dc8c66a910f 100644 --- a/api/envoy/service/health/v3/hds.proto +++ b/api/envoy/service/health/v3/hds.proto @@ -10,6 +10,7 @@ import "envoy/config/endpoint/v3/endpoint_components.proto"; import "google/api/annotations.proto"; import "google/protobuf/duration.proto"; +import "google/protobuf/struct.proto"; import "envoy/annotations/deprecation.proto"; import "udpa/annotations/status.proto"; @@ -109,6 +110,19 @@ message EndpointHealth { config.endpoint.v3.Endpoint endpoint = 1; config.core.v3.HealthStatus health_status = 2; + + // Optional metadata about the health check result, populated by the active + // health checker and forwarded to the management server for richer health + // state interpretation. + // + // Well-known keys: + // + // ``http_status_code`` (number) + // Set by the HTTP health checker. Contains the HTTP response status code + // returned by the upstream endpoint during the most recent health check, + // e.g. ``200``, ``503``. Only present when the health check received a + // complete HTTP response; absent on connection failures or timeouts. + google.protobuf.Struct health_metadata = 3; } // Group endpoint health by locality under each cluster. diff --git a/envoy/upstream/upstream.h b/envoy/upstream/upstream.h index 30d0675d4fe8d..99fdecb7ebf58 100644 --- a/envoy/upstream/upstream.h +++ b/envoy/upstream/upstream.h @@ -323,6 +323,19 @@ class Host : virtual public HostDescription { * Set true to disable active health check for the host. */ virtual void setDisableActiveHealthCheck(bool disable_active_health_check) PURE; + + /** + * Store the HTTP status code from the last active health check response. + * Used by HDS to report richer health state to the control plane. + * 0 means no response has been recorded yet. + */ + virtual void setLastHealthCheckHttpStatus(uint64_t) PURE; + + /** + * @return the HTTP status code from the last active health check response, or + * 0 if no response has been recorded. + */ + virtual absl::optional lastHealthCheckHttpStatus() const PURE; }; using HostConstSharedPtr = std::shared_ptr; diff --git a/source/common/upstream/health_discovery_service.cc b/source/common/upstream/health_discovery_service.cc index 9354ca79e5df6..649c78cb1a0c6 100644 --- a/source/common/upstream/health_discovery_service.cc +++ b/source/common/upstream/health_discovery_service.cc @@ -124,6 +124,14 @@ envoy::service::health::v3::HealthCheckRequestOrEndpointHealthResponse HdsDelega } } + // If a HTTP health check has run, attach the last response code to the + // HDS report so the control plane can interpret richer health states. + auto http_status = host->lastHealthCheckHttpStatus(); + if (http_status.has_value()) { + (*endpoint->mutable_health_metadata()->mutable_fields())["http_status_code"] + .set_number_value(http_status.value()); + } + // TODO(drewsortega): remove this once we are on v4 and endpoint_health_response is // removed. Copy this endpoint's health info to the legacy flat-list. response.mutable_endpoint_health_response()->add_endpoints_health()->MergeFrom(*endpoint); diff --git a/source/common/upstream/upstream_impl.h b/source/common/upstream/upstream_impl.h index ea42d74fcc351..768ee71fb43d1 100644 --- a/source/common/upstream/upstream_impl.h +++ b/source/common/upstream/upstream_impl.h @@ -383,6 +383,11 @@ class HostImplBase : public Host, uint32_t healthFlagsGetAll() const override { return health_flags_; } void healthFlagsSetAll(uint32_t bits) override { health_flags_ |= bits; } + void setLastHealthCheckHttpStatus(uint64_t status) override { last_hc_http_status_ = status; } + absl::optional lastHealthCheckHttpStatus() const override { + return last_hc_http_status_; + } + Host::HealthStatus healthStatus() const override { // Evaluate active health status first. @@ -471,6 +476,7 @@ class HostImplBase : public Host, // flag access? May be we could refactor HealthFlag to contain all these statuses and flags in the // future. std::atomic eds_health_status_{}; + absl::optional> last_hc_http_status_ = absl::nullopt; struct HostHandleImpl : HostHandle { HostHandleImpl(const std::shared_ptr& parent) : parent_(parent) { diff --git a/source/extensions/health_checkers/http/health_checker_impl.cc b/source/extensions/health_checkers/http/health_checker_impl.cc index 5c08c5e0d4083..8d6c8a281aecd 100644 --- a/source/extensions/health_checkers/http/health_checker_impl.cc +++ b/source/extensions/health_checkers/http/health_checker_impl.cc @@ -419,6 +419,11 @@ HttpHealthCheckerImpl::HttpActiveHealthCheckSession::healthCheckResult() { void HttpHealthCheckerImpl::HttpActiveHealthCheckSession::onResponseComplete() { request_in_flight_ = false; + // Store the raw HTTP response code on the host for HDS metadata reporting. + if (response_headers_ != nullptr) { + host_->setLastHealthCheckHttpStatus(Http::Utility::getResponseStatus(*response_headers_)); + } + switch (healthCheckResult()) { case HealthCheckResult::Succeeded: handleSuccess(false); diff --git a/test/common/upstream/hds_test.cc b/test/common/upstream/hds_test.cc index 2a30969a78de9..fd4b754108e05 100644 --- a/test/common/upstream/hds_test.cc +++ b/test/common/upstream/hds_test.cc @@ -1288,5 +1288,82 @@ TEST_F(HdsTest, TestCustomHealthCheckPortWhenUpdate) { } } +// Test that sendResponse includes health_metadata with http_status_code +// when setHealthCheckMetadata has been called on a host. +TEST_F(HdsTest, TestSendResponseWithHealthMetadata) { + EXPECT_CALL(*async_client_, startRaw(_, _, _, _)).WillOnce(Return(&async_stream_)); + EXPECT_CALL(async_stream_, sendMessageRaw_(_, _)); + createHdsDelegate(); + + // Create Message + message.reset(createSimpleMessage()); + + Network::MockClientConnection* connection_ = new NiceMock(); + EXPECT_CALL(server_context_.dispatcher_, createClientConnection_(_, _, _, _)) + .WillRepeatedly(Return(connection_)); + EXPECT_CALL(*server_response_timer_, enableTimer(_, _)).Times(2); + EXPECT_CALL(async_stream_, sendMessageRaw_(_, false)); + EXPECT_CALL(*test_factory_, createClusterInfo(_)).WillOnce(Return(cluster_info_)); + EXPECT_CALL(*connection_, setBufferLimits(_)); + EXPECT_CALL(server_context_.dispatcher_, deferredDelete_(_)); + + // Process message + hds_delegate_->onReceiveMessage(std::move(message)); + connection_->raiseEvent(Network::ConnectionEvent::Connected); + + // Set the HTTP response code on the host to simulate a health check result. + auto& host = hds_delegate_->hdsClusters()[0]->prioritySet().hostSetsPerPriority()[0]->hosts()[0]; + host->setLastHealthCheckHttpStatus(200); + + // Send Response + auto msg = hds_delegate_->sendResponse(); + + // Verify health_metadata contains the HTTP status code in the legacy flat list. + const auto& endpoint_health = msg.endpoint_health_response().endpoints_health(0); + ASSERT_TRUE(endpoint_health.has_health_metadata()); + const auto& fields = endpoint_health.health_metadata().fields(); + auto it = fields.find("http_status_code"); + ASSERT_NE(it, fields.end()); + EXPECT_EQ(it->second.number_value(), 200.0); + + // Also verify the structured (cluster-based) response. + const auto& cluster_endpoint = + msg.endpoint_health_response().cluster_endpoints_health(0).locality_endpoints_health(0); + const auto& structured_endpoint = cluster_endpoint.endpoints_health(0); + ASSERT_TRUE(structured_endpoint.has_health_metadata()); + EXPECT_EQ(structured_endpoint.health_metadata().fields().at("http_status_code").number_value(), + 200.0); +} + +// Test that sendResponse does not include health_metadata when no metadata +// has been set on the host (default empty Struct). +TEST_F(HdsTest, TestSendResponseNoHealthMetadataByDefault) { + EXPECT_CALL(*async_client_, startRaw(_, _, _, _)).WillOnce(Return(&async_stream_)); + EXPECT_CALL(async_stream_, sendMessageRaw_(_, _)); + createHdsDelegate(); + + // Create Message + message.reset(createSimpleMessage()); + + Network::MockClientConnection* connection_ = new NiceMock(); + EXPECT_CALL(server_context_.dispatcher_, createClientConnection_(_, _, _, _)) + .WillRepeatedly(Return(connection_)); + EXPECT_CALL(*server_response_timer_, enableTimer(_, _)).Times(2); + EXPECT_CALL(async_stream_, sendMessageRaw_(_, false)); + EXPECT_CALL(*test_factory_, createClusterInfo(_)).WillOnce(Return(cluster_info_)); + EXPECT_CALL(*connection_, setBufferLimits(_)); + EXPECT_CALL(server_context_.dispatcher_, deferredDelete_(_)); + + // Process message (do NOT set any health metadata on host) + hds_delegate_->onReceiveMessage(std::move(message)); + connection_->raiseEvent(Network::ConnectionEvent::Connected); + + // Send Response + auto msg = hds_delegate_->sendResponse(); + + // Verify health_metadata is absent when no metadata has been set. + EXPECT_FALSE(msg.endpoint_health_response().endpoints_health(0).has_health_metadata()); +} + } // namespace Upstream } // namespace Envoy diff --git a/test/common/upstream/health_checker_impl_test.cc b/test/common/upstream/health_checker_impl_test.cc index e1aaca4a4bcca..63aad863e1f90 100644 --- a/test/common/upstream/health_checker_impl_test.cc +++ b/test/common/upstream/health_checker_impl_test.cc @@ -899,6 +899,42 @@ TEST_F(HttpHealthCheckerImplTest, Success) { cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->coarseHealth()); } +// Verify that lastHealthCheckHttpStatus is recorded for a 200 response and +// updated on a subsequent 503. +TEST_F(HttpHealthCheckerImplTest, LastHealthCheckHttpStatusRecorded) { + setupNoServiceValidationHC(); + EXPECT_CALL(*this, onHostStatus(_, _)).Times(2); + + cluster_->prioritySet().getMockHostSet(0)->hosts_ = { + makeTestHost(cluster_->info_, "tcp://127.0.0.1:80")}; + cluster_->info_->trafficStats()->upstream_cx_total_.inc(); + expectSessionCreate(); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + health_checker_->start(); + + EXPECT_CALL(runtime_.snapshot_, getInteger("health_check.max_interval", _)); + EXPECT_CALL(runtime_.snapshot_, getInteger("health_check.min_interval", _)) + .WillRepeatedly(Return(45000)); + EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(_, _)); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer()); + respond(0, "200", false, false, true); + EXPECT_EQ(200U, + cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->lastHealthCheckHttpStatus()); + + // A second check with a 503 should overwrite the stored status. + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + expectStreamCreate(0); + test_sessions_[0]->interval_timer_->invokeCallback(); + EXPECT_CALL(runtime_.snapshot_, getInteger("health_check.max_interval", _)); + EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(_, _)); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer()); + EXPECT_CALL(event_logger_, logEjectUnhealthy(_, _, _)); + respond(0, "503", false, false, true); + EXPECT_EQ(503U, + cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->lastHealthCheckHttpStatus()); +} + TEST_F(HttpHealthCheckerImplTest, Degraded) { setupNoServiceValidationHC(); EXPECT_CALL(*this, onHostStatus(_, HealthTransition::Changed)).Times(2); diff --git a/test/mocks/upstream/host.h b/test/mocks/upstream/host.h index 28bbd08f5d09a..1506abfbfae84 100644 --- a/test/mocks/upstream/host.h +++ b/test/mocks/upstream/host.h @@ -216,6 +216,8 @@ class MockHostLight : public Host { MOCK_METHOD(absl::optional, lastHcPassTime, (), (const)); MOCK_METHOD(void, setLbPolicyData, (HostLbPolicyDataPtr lb_policy_data)); MOCK_METHOD(OptRef, lbPolicyData, (), (const)); + MOCK_METHOD(void, setLastHealthCheckHttpStatus, (uint64_t)); + MOCK_METHOD(absl::optional, lastHealthCheckHttpStatus, (), (const)); bool disable_active_health_check_ = false; };