Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/queue/health/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,6 @@ func ProbeHandler(healthState *State, prober func() bool, isAggressive bool, tra
return false
}
return true
}, isAggressive, w)
}, w)
}
}
8 changes: 2 additions & 6 deletions pkg/queue/health/health_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,8 @@ func (h *State) drainFinished() {
}

// HandleHealthProbe handles the probe according to the current state of the
// health server. If isAggressive is false and prober has succeeded previously,
// the function returns success without probing user-container again (until
// shutdown).
func (h *State) HandleHealthProbe(prober func() bool, isAggressive bool, w http.ResponseWriter) {
// health server.
func (h *State) HandleHealthProbe(prober func() bool, w http.ResponseWriter) {
sendAlive := func() {
io.WriteString(w, queue.Name)
}
Expand All @@ -107,8 +105,6 @@ func (h *State) HandleHealthProbe(prober func() bool, isAggressive bool, w http.
}

switch {
case !isAggressive && h.isAlive():
sendAlive()
case h.isShuttingDown():
sendShuttingDown()
case prober != nil && !prober():
Expand Down
81 changes: 30 additions & 51 deletions pkg/queue/health/health_state_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,79 +67,58 @@ func TestHealthStateHealthHandler(t *testing.T) {
alive bool
shuttingDown bool
prober func() bool
isAggressive bool
wantStatus int
wantBody string
}{{
name: "alive: true, K-Probe",
alive: true,
isAggressive: false,
wantStatus: http.StatusOK,
wantBody: queue.Name,
name: "alive: true",
alive: true,
wantStatus: http.StatusOK,
wantBody: queue.Name,
}, {
name: "alive: false, prober: true, K-Probe",
prober: func() bool { return true },
isAggressive: false,
wantStatus: http.StatusOK,
wantBody: queue.Name,
name: "alive: false, prober: true",
prober: func() bool { return true },
wantStatus: http.StatusOK,
wantBody: queue.Name,
}, {
name: "alive: false, prober: false, K-Probe",
prober: func() bool { return false },
isAggressive: false,
wantStatus: http.StatusServiceUnavailable,
name: "alive: false, prober: false",
prober: func() bool { return false },
wantStatus: http.StatusServiceUnavailable,
}, {
name: "alive: false, no prober, K-Probe",
isAggressive: false,
wantStatus: http.StatusOK,
wantBody: queue.Name,
name: "alive: false, no prober",
wantStatus: http.StatusOK,
wantBody: queue.Name,
}, {
name: "shuttingDown: true, K-Probe",
name: "shuttingDown: true",
shuttingDown: true,
isAggressive: false,
wantStatus: http.StatusGone,
}, {
name: "no prober, shuttingDown: false",
isAggressive: true,
wantStatus: http.StatusOK,
wantBody: queue.Name,
name: "no prober, shuttingDown: false",
wantStatus: http.StatusOK,
wantBody: queue.Name,
}, {
name: "prober: true, shuttingDown: true",
shuttingDown: true,
prober: func() bool { return true },
isAggressive: true,
wantStatus: http.StatusGone,
}, {
name: "prober: true, shuttingDown: false",
prober: func() bool { return true },
isAggressive: true,
wantStatus: http.StatusOK,
wantBody: queue.Name,
name: "prober: true, shuttingDown: false",
prober: func() bool { return true },
wantStatus: http.StatusOK,
wantBody: queue.Name,
}, {
name: "prober: false, shuttingDown: false",
prober: func() bool { return false },
isAggressive: true,
wantStatus: http.StatusServiceUnavailable,
name: "prober: false, shuttingDown: false",
prober: func() bool { return false },
wantStatus: http.StatusServiceUnavailable,
}, {
name: "prober: false, shuttingDown: true",
shuttingDown: true,
prober: func() bool { return false },
isAggressive: true,
wantStatus: http.StatusGone,
}, {
name: "alive: true, prober: false, shuttingDown: false",
alive: true,
prober: func() bool { return false },
isAggressive: true,
wantStatus: http.StatusServiceUnavailable,
}, {
name: "aggressive: false, alive: true, prober: false, shuttingDown: false",
alive: true,
prober: func() bool { return false },
isAggressive: false,
// The current behaviour is that if aggressive is false (ie probePeriod > 0),
// we actually don't probe once the first probe succeeds.
wantStatus: http.StatusOK,
wantBody: queue.Name,
name: "alive: true, prober: false, shuttingDown: false",
alive: true,
prober: func() bool { return false },
wantStatus: http.StatusServiceUnavailable,
}}

for _, test := range tests {
Expand All @@ -149,7 +128,7 @@ func TestHealthStateHealthHandler(t *testing.T) {
state.shuttingDown = test.shuttingDown

rr := httptest.NewRecorder()
state.HandleHealthProbe(test.prober, test.isAggressive, rr)
state.HandleHealthProbe(test.prober, rr)

if got, want := rr.Code, test.wantStatus; got != want {
t.Errorf("handler returned wrong status code: got %v want %v", got, want)
Expand Down
120 changes: 41 additions & 79 deletions test/conformance/runtime/readiness_probe_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,96 +148,58 @@ func TestProbeRuntime(t *testing.T) {
}

// This test validates the behaviour of readiness probes *after* initial
// startup. Note: the current behaviour is not ideal: when periodSeconds > 0 we
// ignore readiness probes after startup. Also, when a pod goes unready after
// startup and there are no other pods in the revision we hang, potentially
// forever, which may not be what a user wants. The goal of this test is
// largely to describe the current behaviour, so that we can confidently change it.
// See:
// - https://github.com/knative/serving/issues/10764
// - https://github.com/knative/serving/issues/10765
// startup. When a pod goes unready after startup and there are no other pods
// in the revision we hang, potentially forever, which may or may not be what a
// user wants.
// The goal of this test is largely to describe the current behaviour, so that
// we can confidently change it.
// See https://github.com/knative/serving/issues/10765.
func TestProbeRuntimeAfterStartup(t *testing.T) {
t.Parallel()

t.Run("with periodSeconds = 0 (QP optimisation enabled)", func(t *testing.T) {
t.Parallel()
clients := test.Setup(t)
names := test.ResourceNames{Service: test.ObjectNameForTest(t), Image: test.Readiness}
test.EnsureTearDown(t, clients, &names)
for _, period := range []int32{0, 1} {
period := period
t.Run(fmt.Sprintf("periodSeconds=%d", period), func(t *testing.T) {
t.Parallel()
clients := test.Setup(t)
names := test.ResourceNames{Service: test.ObjectNameForTest(t), Image: test.Readiness}
test.EnsureTearDown(t, clients, &names)

url, client := waitReadyThenStartFailing(t, clients, names, period)
if err := wait.PollImmediate(1*time.Second, readinessPropagationTime, func() (bool, error) {
startFailing, err := http.NewRequest(http.MethodGet, url.String(), nil)
if err != nil {
return false, err
}

// When periodSeconds = 0 we expect that once readiness propagates
// there will be no ready pods, and therefore the request will time
// out.
// (see https://github.com/knative/serving/issues/10765).
url, client := waitReadyThenStartFailing(t, clients, names, 0)
if err := wait.PollImmediate(1*time.Second, readinessPropagationTime, func() (bool, error) {
startFailing, err := http.NewRequest(http.MethodGet, url.String(), nil)
if err != nil {
return false, err
}
// 5 Seconds is enough to be confident the request is timing out.
client.Client.Timeout = 5 * time.Second

// 5 Seconds is enough to be confident the request is timing out.
client.Client.Timeout = 5 * time.Second
resp, err := client.Do(startFailing, func(err error) (bool, error) {
if isTimeout(err) {
// We're actually expecting a timeout here, so don't retry on timeouts.
return false, nil
}

resp, err := client.Do(startFailing, func(err error) (bool, error) {
return spoof.DefaultErrorRetryChecker(err)
})
if isTimeout(err) {
// We're actually expecting a timeout here, so don't retry on timeouts.
// We expect to eventually time out, so this is the success case.
return true, nil
} else if err != nil {
// Other errors are not expected.
return false, err
} else if resp.StatusCode == http.StatusOK {
// We'll continue to get 200s for a while until readiness propagates.
return false, nil
}

return spoof.DefaultErrorRetryChecker(err)
})
if isTimeout(err) {
// We expect to eventually time out, so this is the success case.
return true, nil
} else if err != nil {
// Other errors are not expected.
return false, err
} else if resp.StatusCode == http.StatusOK {
// We'll continue to get 200s for a while until readiness propagates.
return false, nil
}

return false, errors.New("Received non-200 status code (expected to eventually time out)")
}); err != nil {
t.Fatal("Expected to eventually see request timeout due to all pods becoming unready, but got:", err)
}
})

t.Run("with periodSeconds > 0 (no QP optimisation)", func(t *testing.T) {
t.Parallel()
clients := test.Setup(t)
names := test.ResourceNames{Service: test.ObjectNameForTest(t), Image: test.Readiness}
test.EnsureTearDown(t, clients, &names)

// When periodSeconds > 0 we ignore readinessProbes after startup, so
// this request will continue to work, even though all pods are failing
// readiness.
// (see https://github.com/knative/serving/issues/10764).
url, _ := waitReadyThenStartFailing(t, clients, names, 1)
if err := wait.PollImmediate(1*time.Second, readinessPropagationTime, func() (bool, error) {
if _, err := pkgtest.CheckEndpointState(
context.Background(),
clients.KubeClient,
t.Logf,
url,
v1test.RetryingRouteInconsistency(spoof.MatchesAllOf(spoof.IsStatusOK, spoof.MatchesBody(test.HelloWorldText))),
"readinessIsReady",
test.ServingFlags.ResolvableDomain,
test.AddRootCAtoTransport(context.Background(), t.Logf, clients, test.ServingFlags.HTTPS),
); err != nil {
return false, err
return false, errors.New("Received non-200 status code (expected to eventually time out)")
}); err != nil {
t.Fatal("Expected to eventually see request timeout due to all pods becoming unready, but got:", err)
}

// Keep retrying for readinessPropagationTime to allow
// readiness to propagate, otherwise we could get a false
// positive since ingresses and activator may not have noticed
// the readiness change yet.
return false, nil
}); !errors.Is(err, wait.ErrWaitTimeout) {
t.Fatalf("Expected endpoint for Route %s at %s to consistently return success (even though readinessProbe is failing), but got: %v", names.Route, url, err)
}
})
})
}
}

// waitReadyThenStartFailing creates a service, waits for it to pass readiness,
Expand Down