From 4b3390b29bc29fdac06def795adcf8fc014126bb Mon Sep 17 00:00:00 2001 From: Julz Friedman Date: Wed, 14 Apr 2021 22:13:57 +0100 Subject: [PATCH 1/3] Probe properly after startup when periodSeconds greater than zero --- pkg/queue/health/handler.go | 2 +- pkg/queue/health/health_state.go | 8 +-- pkg/queue/health/health_state_test.go | 79 ++++++++++----------------- 3 files changed, 32 insertions(+), 57 deletions(-) diff --git a/pkg/queue/health/handler.go b/pkg/queue/health/handler.go index 8f80924fda70..7bca57f51092 100644 --- a/pkg/queue/health/handler.go +++ b/pkg/queue/health/handler.go @@ -65,6 +65,6 @@ func ProbeHandler(healthState *State, prober func() bool, isAggressive bool, tra return false } return true - }, isAggressive, w) + }, w) } } diff --git a/pkg/queue/health/health_state.go b/pkg/queue/health/health_state.go index ce2b541ffbf4..0e9750bc5123 100644 --- a/pkg/queue/health/health_state.go +++ b/pkg/queue/health/health_state.go @@ -90,10 +90,8 @@ func (h *State) drainFinished() { } // HandleHealthProbe handles the probe according to the current state of the -// health server. If isAggressive is false and prober has succeeded previously, -// the function returns success without probing user-container again (until -// shutdown). -func (h *State) HandleHealthProbe(prober func() bool, isAggressive bool, w http.ResponseWriter) { +// health server. +func (h *State) HandleHealthProbe(prober func() bool, w http.ResponseWriter) { sendAlive := func() { io.WriteString(w, queue.Name) } @@ -107,8 +105,6 @@ func (h *State) HandleHealthProbe(prober func() bool, isAggressive bool, w http. } switch { - case !isAggressive && h.isAlive(): - sendAlive() case h.isShuttingDown(): sendShuttingDown() case prober != nil && !prober(): diff --git a/pkg/queue/health/health_state_test.go b/pkg/queue/health/health_state_test.go index e71e32c9ae02..f4711efcea86 100644 --- a/pkg/queue/health/health_state_test.go +++ b/pkg/queue/health/health_state_test.go @@ -67,79 +67,58 @@ func TestHealthStateHealthHandler(t *testing.T) { alive bool shuttingDown bool prober func() bool - isAggressive bool wantStatus int wantBody string }{{ - name: "alive: true, K-Probe", - alive: true, - isAggressive: false, - wantStatus: http.StatusOK, - wantBody: queue.Name, + name: "alive: true, K-Probe", + alive: true, + wantStatus: http.StatusOK, + wantBody: queue.Name, }, { - name: "alive: false, prober: true, K-Probe", - prober: func() bool { return true }, - isAggressive: false, - wantStatus: http.StatusOK, - wantBody: queue.Name, + name: "alive: false, prober: true, K-Probe", + prober: func() bool { return true }, + wantStatus: http.StatusOK, + wantBody: queue.Name, }, { - name: "alive: false, prober: false, K-Probe", - prober: func() bool { return false }, - isAggressive: false, - wantStatus: http.StatusServiceUnavailable, + name: "alive: false, prober: false, K-Probe", + prober: func() bool { return false }, + wantStatus: http.StatusServiceUnavailable, }, { - name: "alive: false, no prober, K-Probe", - isAggressive: false, - wantStatus: http.StatusOK, - wantBody: queue.Name, + name: "alive: false, no prober, K-Probe", + wantStatus: http.StatusOK, + wantBody: queue.Name, }, { name: "shuttingDown: true, K-Probe", shuttingDown: true, - isAggressive: false, wantStatus: http.StatusGone, }, { - name: "no prober, shuttingDown: false", - isAggressive: true, - wantStatus: http.StatusOK, - wantBody: queue.Name, + name: "no prober, shuttingDown: false", + wantStatus: http.StatusOK, + wantBody: queue.Name, }, { name: "prober: true, shuttingDown: true", shuttingDown: true, prober: func() bool { return true }, - isAggressive: true, wantStatus: http.StatusGone, }, { - name: "prober: true, shuttingDown: false", - prober: func() bool { return true }, - isAggressive: true, - wantStatus: http.StatusOK, - wantBody: queue.Name, + name: "prober: true, shuttingDown: false", + prober: func() bool { return true }, + wantStatus: http.StatusOK, + wantBody: queue.Name, }, { - name: "prober: false, shuttingDown: false", - prober: func() bool { return false }, - isAggressive: true, - wantStatus: http.StatusServiceUnavailable, + name: "prober: false, shuttingDown: false", + prober: func() bool { return false }, + wantStatus: http.StatusServiceUnavailable, }, { name: "prober: false, shuttingDown: true", shuttingDown: true, prober: func() bool { return false }, - isAggressive: true, wantStatus: http.StatusGone, }, { - name: "alive: true, prober: false, shuttingDown: false", - alive: true, - prober: func() bool { return false }, - isAggressive: true, - wantStatus: http.StatusServiceUnavailable, - }, { - name: "aggressive: false, alive: true, prober: false, shuttingDown: false", - alive: true, - prober: func() bool { return false }, - isAggressive: false, - // The current behaviour is that if aggressive is false (ie probePeriod > 0), - // we actually don't probe once the first probe succeeds. - wantStatus: http.StatusOK, - wantBody: queue.Name, + name: "alive: true, prober: false, shuttingDown: false", + alive: true, + prober: func() bool { return false }, + wantStatus: http.StatusServiceUnavailable, }} for _, test := range tests { @@ -149,7 +128,7 @@ func TestHealthStateHealthHandler(t *testing.T) { state.shuttingDown = test.shuttingDown rr := httptest.NewRecorder() - state.HandleHealthProbe(test.prober, test.isAggressive, rr) + state.HandleHealthProbe(test.prober, rr) if got, want := rr.Code, test.wantStatus; got != want { t.Errorf("handler returned wrong status code: got %v want %v", got, want) From 3dc13867868daf6cf064563371ea417252dea5d7 Mon Sep 17 00:00:00 2001 From: Julz Friedman Date: Thu, 19 Aug 2021 14:52:14 +0100 Subject: [PATCH 2/3] Update comformance test for fixed behaviour --- .../runtime/readiness_probe_test.go | 120 ++++++------------ 1 file changed, 41 insertions(+), 79 deletions(-) diff --git a/test/conformance/runtime/readiness_probe_test.go b/test/conformance/runtime/readiness_probe_test.go index de6253f883f1..809c4c627913 100644 --- a/test/conformance/runtime/readiness_probe_test.go +++ b/test/conformance/runtime/readiness_probe_test.go @@ -148,96 +148,58 @@ func TestProbeRuntime(t *testing.T) { } // This test validates the behaviour of readiness probes *after* initial -// startup. Note: the current behaviour is not ideal: when periodSeconds > 0 we -// ignore readiness probes after startup. Also, when a pod goes unready after -// startup and there are no other pods in the revision we hang, potentially -// forever, which may not be what a user wants. The goal of this test is -// largely to describe the current behaviour, so that we can confidently change it. -// See: -// - https://github.com/knative/serving/issues/10764 -// - https://github.com/knative/serving/issues/10765 +// startup. When a pod goes unready after startup and there are no other pods +// in the revision we hang, potentially forever, which may or may not be what a +// user wants. +// The goal of this test is largely to describe the current behaviour, so that +// we can confidently change it. +// See https://github.com/knative/serving/issues/10765. func TestProbeRuntimeAfterStartup(t *testing.T) { t.Parallel() - t.Run("with periodSeconds = 0 (QP optimisation enabled)", func(t *testing.T) { - t.Parallel() - clients := test.Setup(t) - names := test.ResourceNames{Service: test.ObjectNameForTest(t), Image: test.Readiness} - test.EnsureTearDown(t, clients, &names) + for _, period := range []int32{0, 1} { + period := period + t.Run(fmt.Sprintf("periodSeconds=%d", period), func(t *testing.T) { + t.Parallel() + clients := test.Setup(t) + names := test.ResourceNames{Service: test.ObjectNameForTest(t), Image: test.Readiness} + test.EnsureTearDown(t, clients, &names) + + url, client := waitReadyThenStartFailing(t, clients, names, period) + if err := wait.PollImmediate(1*time.Second, readinessPropagationTime, func() (bool, error) { + startFailing, err := http.NewRequest(http.MethodGet, url.String(), nil) + if err != nil { + return false, err + } - // When periodSeconds = 0 we expect that once readiness propagates - // there will be no ready pods, and therefore the request will time - // out. - // (see https://github.com/knative/serving/issues/10765). - url, client := waitReadyThenStartFailing(t, clients, names, 0) - if err := wait.PollImmediate(1*time.Second, readinessPropagationTime, func() (bool, error) { - startFailing, err := http.NewRequest(http.MethodGet, url.String(), nil) - if err != nil { - return false, err - } + // 5 Seconds is enough to be confident the request is timing out. + client.Client.Timeout = 5 * time.Second - // 5 Seconds is enough to be confident the request is timing out. - client.Client.Timeout = 5 * time.Second + resp, err := client.Do(startFailing, func(err error) (bool, error) { + if isTimeout(err) { + // We're actually expecting a timeout here, so don't retry on timeouts. + return false, nil + } - resp, err := client.Do(startFailing, func(err error) (bool, error) { + return spoof.DefaultErrorRetryChecker(err) + }) if isTimeout(err) { - // We're actually expecting a timeout here, so don't retry on timeouts. + // We expect to eventually time out, so this is the success case. + return true, nil + } else if err != nil { + // Other errors are not expected. + return false, err + } else if resp.StatusCode == http.StatusOK { + // We'll continue to get 200s for a while until readiness propagates. return false, nil } - return spoof.DefaultErrorRetryChecker(err) - }) - if isTimeout(err) { - // We expect to eventually time out, so this is the success case. - return true, nil - } else if err != nil { - // Other errors are not expected. - return false, err - } else if resp.StatusCode == http.StatusOK { - // We'll continue to get 200s for a while until readiness propagates. - return false, nil - } - - return false, errors.New("Received non-200 status code (expected to eventually time out)") - }); err != nil { - t.Fatal("Expected to eventually see request timeout due to all pods becoming unready, but got:", err) - } - }) - - t.Run("with periodSeconds > 0 (no QP optimisation)", func(t *testing.T) { - t.Parallel() - clients := test.Setup(t) - names := test.ResourceNames{Service: test.ObjectNameForTest(t), Image: test.Readiness} - test.EnsureTearDown(t, clients, &names) - - // When periodSeconds > 0 we ignore readinessProbes after startup, so - // this request will continue to work, even though all pods are failing - // readiness. - // (see https://github.com/knative/serving/issues/10764). - url, _ := waitReadyThenStartFailing(t, clients, names, 1) - if err := wait.PollImmediate(1*time.Second, readinessPropagationTime, func() (bool, error) { - if _, err := pkgtest.CheckEndpointState( - context.Background(), - clients.KubeClient, - t.Logf, - url, - v1test.RetryingRouteInconsistency(spoof.MatchesAllOf(spoof.IsStatusOK, spoof.MatchesBody(test.HelloWorldText))), - "readinessIsReady", - test.ServingFlags.ResolvableDomain, - test.AddRootCAtoTransport(context.Background(), t.Logf, clients, test.ServingFlags.HTTPS), - ); err != nil { - return false, err + return false, errors.New("Received non-200 status code (expected to eventually time out)") + }); err != nil { + t.Fatal("Expected to eventually see request timeout due to all pods becoming unready, but got:", err) } - - // Keep retrying for readinessPropagationTime to allow - // readiness to propagate, otherwise we could get a false - // positive since ingresses and activator may not have noticed - // the readiness change yet. - return false, nil - }); !errors.Is(err, wait.ErrWaitTimeout) { - t.Fatalf("Expected endpoint for Route %s at %s to consistently return success (even though readinessProbe is failing), but got: %v", names.Route, url, err) - } - }) + }) + } } // waitReadyThenStartFailing creates a service, waits for it to pass readiness, From e8e3baa96d63e359b181566567a2865bacf41337 Mon Sep 17 00:00:00 2001 From: Julz Friedman Date: Thu, 19 Aug 2021 15:50:13 +0100 Subject: [PATCH 3/3] Remove redundant K-Probe since this is no longer aggressive-probe specific --- pkg/queue/health/health_state_test.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/queue/health/health_state_test.go b/pkg/queue/health/health_state_test.go index f4711efcea86..74d004e7c357 100644 --- a/pkg/queue/health/health_state_test.go +++ b/pkg/queue/health/health_state_test.go @@ -70,25 +70,25 @@ func TestHealthStateHealthHandler(t *testing.T) { wantStatus int wantBody string }{{ - name: "alive: true, K-Probe", + name: "alive: true", alive: true, wantStatus: http.StatusOK, wantBody: queue.Name, }, { - name: "alive: false, prober: true, K-Probe", + name: "alive: false, prober: true", prober: func() bool { return true }, wantStatus: http.StatusOK, wantBody: queue.Name, }, { - name: "alive: false, prober: false, K-Probe", + name: "alive: false, prober: false", prober: func() bool { return false }, wantStatus: http.StatusServiceUnavailable, }, { - name: "alive: false, no prober, K-Probe", + name: "alive: false, no prober", wantStatus: http.StatusOK, wantBody: queue.Name, }, { - name: "shuttingDown: true, K-Probe", + name: "shuttingDown: true", shuttingDown: true, wantStatus: http.StatusGone, }, {