Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions api/envoy/service/health/v3/hds.proto
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import "envoy/config/endpoint/v3/endpoint_components.proto";

import "google/api/annotations.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/struct.proto";

import "envoy/annotations/deprecation.proto";
import "udpa/annotations/status.proto";
Expand Down Expand Up @@ -109,6 +110,19 @@ message EndpointHealth {
config.endpoint.v3.Endpoint endpoint = 1;

config.core.v3.HealthStatus health_status = 2;

// Optional metadata about the health check result, populated by the active
// health checker and forwarded to the management server for richer health
// state interpretation.
//
// Well-known keys:
//
// ``http_status_code`` (number)
// Set by the HTTP health checker. Contains the HTTP response status code
// returned by the upstream endpoint during the most recent health check,
// e.g. ``200``, ``503``. Only present when the health check received a
// complete HTTP response; absent on connection failures or timeouts.
google.protobuf.Struct health_metadata = 3;
}

// Group endpoint health by locality under each cluster.
Expand Down
13 changes: 13 additions & 0 deletions envoy/upstream/upstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,19 @@ class Host : virtual public HostDescription {
* Set true to disable active health check for the host.
*/
virtual void setDisableActiveHealthCheck(bool disable_active_health_check) PURE;

/**
* Store the HTTP status code from the last active health check response.
* Used by HDS to report richer health state to the control plane.
* 0 means no response has been recorded yet.
*/
virtual void setLastHealthCheckHttpStatus(uint64_t) PURE;

/**
* @return the HTTP status code from the last active health check response, or
* 0 if no response has been recorded.
*/
virtual absl::optional<uint64_t> lastHealthCheckHttpStatus() const PURE;
};

using HostConstSharedPtr = std::shared_ptr<const Host>;
Expand Down
8 changes: 8 additions & 0 deletions source/common/upstream/health_discovery_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,14 @@ envoy::service::health::v3::HealthCheckRequestOrEndpointHealthResponse HdsDelega
}
}

// If a HTTP health check has run, attach the last response code to the
// HDS report so the control plane can interpret richer health states.
auto http_status = host->lastHealthCheckHttpStatus();
if (http_status.has_value()) {
(*endpoint->mutable_health_metadata()->mutable_fields())["http_status_code"]
.set_number_value(http_status.value());
}

// TODO(drewsortega): remove this once we are on v4 and endpoint_health_response is
// removed. Copy this endpoint's health info to the legacy flat-list.
response.mutable_endpoint_health_response()->add_endpoints_health()->MergeFrom(*endpoint);
Expand Down
6 changes: 6 additions & 0 deletions source/common/upstream/upstream_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,11 @@ class HostImplBase : public Host,
uint32_t healthFlagsGetAll() const override { return health_flags_; }
void healthFlagsSetAll(uint32_t bits) override { health_flags_ |= bits; }

void setLastHealthCheckHttpStatus(uint64_t status) override { last_hc_http_status_ = status; }
absl::optional<uint64_t> lastHealthCheckHttpStatus() const override {
return last_hc_http_status_;
}

Host::HealthStatus healthStatus() const override {
// Evaluate active health status first.

Expand Down Expand Up @@ -471,6 +476,7 @@ class HostImplBase : public Host,
// flag access? May be we could refactor HealthFlag to contain all these statuses and flags in the
// future.
std::atomic<Host::HealthStatus> eds_health_status_{};
absl::optional<std::atomic<uint64_t>> last_hc_http_status_ = absl::nullopt;

struct HostHandleImpl : HostHandle {
HostHandleImpl(const std::shared_ptr<const HostImplBase>& parent) : parent_(parent) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,11 @@ HttpHealthCheckerImpl::HttpActiveHealthCheckSession::healthCheckResult() {
void HttpHealthCheckerImpl::HttpActiveHealthCheckSession::onResponseComplete() {
request_in_flight_ = false;

// Store the raw HTTP response code on the host for HDS metadata reporting.
if (response_headers_ != nullptr) {
host_->setLastHealthCheckHttpStatus(Http::Utility::getResponseStatus(*response_headers_));
}

switch (healthCheckResult()) {
case HealthCheckResult::Succeeded:
handleSuccess(false);
Expand Down
77 changes: 77 additions & 0 deletions test/common/upstream/hds_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1288,5 +1288,82 @@ TEST_F(HdsTest, TestCustomHealthCheckPortWhenUpdate) {
}
}

// Test that sendResponse includes health_metadata with http_status_code
// when setHealthCheckMetadata has been called on a host.
TEST_F(HdsTest, TestSendResponseWithHealthMetadata) {
EXPECT_CALL(*async_client_, startRaw(_, _, _, _)).WillOnce(Return(&async_stream_));
EXPECT_CALL(async_stream_, sendMessageRaw_(_, _));
createHdsDelegate();

// Create Message
message.reset(createSimpleMessage());

Network::MockClientConnection* connection_ = new NiceMock<Network::MockClientConnection>();
EXPECT_CALL(server_context_.dispatcher_, createClientConnection_(_, _, _, _))
.WillRepeatedly(Return(connection_));
EXPECT_CALL(*server_response_timer_, enableTimer(_, _)).Times(2);
EXPECT_CALL(async_stream_, sendMessageRaw_(_, false));
EXPECT_CALL(*test_factory_, createClusterInfo(_)).WillOnce(Return(cluster_info_));
EXPECT_CALL(*connection_, setBufferLimits(_));
EXPECT_CALL(server_context_.dispatcher_, deferredDelete_(_));

// Process message
hds_delegate_->onReceiveMessage(std::move(message));
connection_->raiseEvent(Network::ConnectionEvent::Connected);

// Set the HTTP response code on the host to simulate a health check result.
auto& host = hds_delegate_->hdsClusters()[0]->prioritySet().hostSetsPerPriority()[0]->hosts()[0];
host->setLastHealthCheckHttpStatus(200);

// Send Response
auto msg = hds_delegate_->sendResponse();

// Verify health_metadata contains the HTTP status code in the legacy flat list.
const auto& endpoint_health = msg.endpoint_health_response().endpoints_health(0);
ASSERT_TRUE(endpoint_health.has_health_metadata());
const auto& fields = endpoint_health.health_metadata().fields();
auto it = fields.find("http_status_code");
ASSERT_NE(it, fields.end());
EXPECT_EQ(it->second.number_value(), 200.0);

// Also verify the structured (cluster-based) response.
const auto& cluster_endpoint =
msg.endpoint_health_response().cluster_endpoints_health(0).locality_endpoints_health(0);
const auto& structured_endpoint = cluster_endpoint.endpoints_health(0);
ASSERT_TRUE(structured_endpoint.has_health_metadata());
EXPECT_EQ(structured_endpoint.health_metadata().fields().at("http_status_code").number_value(),
200.0);
}

// Test that sendResponse does not include health_metadata when no metadata
// has been set on the host (default empty Struct).
TEST_F(HdsTest, TestSendResponseNoHealthMetadataByDefault) {
EXPECT_CALL(*async_client_, startRaw(_, _, _, _)).WillOnce(Return(&async_stream_));
EXPECT_CALL(async_stream_, sendMessageRaw_(_, _));
createHdsDelegate();

// Create Message
message.reset(createSimpleMessage());

Network::MockClientConnection* connection_ = new NiceMock<Network::MockClientConnection>();
EXPECT_CALL(server_context_.dispatcher_, createClientConnection_(_, _, _, _))
.WillRepeatedly(Return(connection_));
EXPECT_CALL(*server_response_timer_, enableTimer(_, _)).Times(2);
EXPECT_CALL(async_stream_, sendMessageRaw_(_, false));
EXPECT_CALL(*test_factory_, createClusterInfo(_)).WillOnce(Return(cluster_info_));
EXPECT_CALL(*connection_, setBufferLimits(_));
EXPECT_CALL(server_context_.dispatcher_, deferredDelete_(_));

// Process message (do NOT set any health metadata on host)
hds_delegate_->onReceiveMessage(std::move(message));
connection_->raiseEvent(Network::ConnectionEvent::Connected);

// Send Response
auto msg = hds_delegate_->sendResponse();

// Verify health_metadata is absent when no metadata has been set.
EXPECT_FALSE(msg.endpoint_health_response().endpoints_health(0).has_health_metadata());
}

} // namespace Upstream
} // namespace Envoy
36 changes: 36 additions & 0 deletions test/common/upstream/health_checker_impl_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,42 @@ TEST_F(HttpHealthCheckerImplTest, Success) {
cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->coarseHealth());
}

// Verify that lastHealthCheckHttpStatus is recorded for a 200 response and
// updated on a subsequent 503.
TEST_F(HttpHealthCheckerImplTest, LastHealthCheckHttpStatusRecorded) {
setupNoServiceValidationHC();
EXPECT_CALL(*this, onHostStatus(_, _)).Times(2);

cluster_->prioritySet().getMockHostSet(0)->hosts_ = {
makeTestHost(cluster_->info_, "tcp://127.0.0.1:80")};
cluster_->info_->trafficStats()->upstream_cx_total_.inc();
expectSessionCreate();
expectStreamCreate(0);
EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _));
health_checker_->start();

EXPECT_CALL(runtime_.snapshot_, getInteger("health_check.max_interval", _));
EXPECT_CALL(runtime_.snapshot_, getInteger("health_check.min_interval", _))
.WillRepeatedly(Return(45000));
EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(_, _));
EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer());
respond(0, "200", false, false, true);
EXPECT_EQ(200U,
cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->lastHealthCheckHttpStatus());

// A second check with a 503 should overwrite the stored status.
EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _));
expectStreamCreate(0);
test_sessions_[0]->interval_timer_->invokeCallback();
EXPECT_CALL(runtime_.snapshot_, getInteger("health_check.max_interval", _));
EXPECT_CALL(*test_sessions_[0]->interval_timer_, enableTimer(_, _));
EXPECT_CALL(*test_sessions_[0]->timeout_timer_, disableTimer());
EXPECT_CALL(event_logger_, logEjectUnhealthy(_, _, _));
respond(0, "503", false, false, true);
EXPECT_EQ(503U,
cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->lastHealthCheckHttpStatus());
}

TEST_F(HttpHealthCheckerImplTest, Degraded) {
setupNoServiceValidationHC();
EXPECT_CALL(*this, onHostStatus(_, HealthTransition::Changed)).Times(2);
Expand Down
2 changes: 2 additions & 0 deletions test/mocks/upstream/host.h
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,8 @@ class MockHostLight : public Host {
MOCK_METHOD(absl::optional<MonotonicTime>, lastHcPassTime, (), (const));
MOCK_METHOD(void, setLbPolicyData, (HostLbPolicyDataPtr lb_policy_data));
MOCK_METHOD(OptRef<HostLbPolicyData>, lbPolicyData, (), (const));
MOCK_METHOD(void, setLastHealthCheckHttpStatus, (uint64_t));
MOCK_METHOD(absl::optional<uint64_t>, lastHealthCheckHttpStatus, (), (const));

bool disable_active_health_check_ = false;
};
Expand Down
Loading