knative · knative-prow-robot · Aug 19, 2021 · Aug 17, 2021
diff --git a/test/conformance/runtime/readiness_probe_test.go b/test/conformance/runtime/readiness_probe_test.go
@@ -24,6 +24,7 @@ import (
 	"fmt"
 	"net"
 	"net/http"
+	"net/url"
 	"testing"
 	"time"
 
@@ -146,27 +147,107 @@ func TestProbeRuntime(t *testing.T) {
 	}
 }
 
-// This test validates the behaviour of readiness probes *after* initial startup.
-// The current behaviour is not ideal: when a pod goes unready after startup
-// and there are no other pods in the revision we hang, potentially forever,
-// which may not be what a user wants.
-// See https://github.com/knative/serving/issues/10765.
+// This test validates the behaviour of readiness probes *after* initial
+// startup. Note: the current behaviour is not ideal: when periodSeconds > 0 we
+// ignore readiness probes after startup. Also, when a pod goes unready after
+// startup and there are no other pods in the revision we hang, potentially
+// forever, which may not be what a user wants. The goal of this test is
+// largely to describe the current behaviour, so that we can confidently change it.
+// See:
+//  - https://github.com/knative/serving/issues/10764
+//  - https://github.com/knative/serving/issues/10765
 func TestProbeRuntimeAfterStartup(t *testing.T) {
 	t.Parallel()
-	clients := test.Setup(t)
 
-	names := test.ResourceNames{
-		Service: test.ObjectNameForTest(t),
-		Image:   test.Readiness,
-	}
+	t.Run("with periodSeconds = 0 (QP optimisation enabled)", func(t *testing.T) {
+		t.Parallel()
+		clients := test.Setup(t)
+		names := test.ResourceNames{Service: test.ObjectNameForTest(t), Image: test.Readiness}
+		test.EnsureTearDown(t, clients, &names)
+
+		// When periodSeconds = 0 we expect that once readiness propagates
+		// there will be no ready pods, and therefore the request will time
+		// out.
+		// (see https://github.com/knative/serving/issues/10765).
+		url, client := waitReadyThenStartFailing(t, clients, names, 0)
+		if err := wait.PollImmediate(1*time.Second, readinessPropagationTime, func() (bool, error) {
+			startFailing, err := http.NewRequest(http.MethodGet, url.String(), nil)
+			if err != nil {
+				return false, err
+			}
+
+			// 5 Seconds is enough to be confident the request is timing out.
+			client.Client.Timeout = 5 * time.Second
+
+			resp, err := client.Do(startFailing, func(err error) (bool, error) {
+				if isTimeout(err) {
+					// We're actually expecting a timeout here, so don't retry on timeouts.
+					return false, nil
+				}
+
+				return spoof.DefaultErrorRetryChecker(err)
+			})
+			if isTimeout(err) {
+				// We expect to eventually time out, so this is the success case.
+				return true, nil
+			} else if err != nil {
+				// Other errors are not expected.
+				return false, err
+			} else if resp.StatusCode == http.StatusOK {
+				// We'll continue to get 200s for a while until readiness propagates.
+				return false, nil
+			}
+
+			return false, errors.New("Received non-200 status code (expected to eventually time out)")
+		}); err != nil {
+			t.Fatal("Expected to eventually see request timeout due to all pods becoming unready, but got:", err)
+		}
+	})
+
+	t.Run("with periodSeconds > 0 (no QP optimisation)", func(t *testing.T) {
+		t.Parallel()
+		clients := test.Setup(t)
+		names := test.ResourceNames{Service: test.ObjectNameForTest(t), Image: test.Readiness}
+		test.EnsureTearDown(t, clients, &names)
+
+		// When periodSeconds > 0 we ignore readinessProbes after startup, so
+		// this request will continue to work, even though all pods are failing
+		// readiness.
+		// (see https://github.com/knative/serving/issues/10764).
+		url, _ := waitReadyThenStartFailing(t, clients, names, 1)
+		if err := wait.PollImmediate(1*time.Second, readinessPropagationTime, func() (bool, error) {
+			if _, err := pkgtest.CheckEndpointState(
+				context.Background(),
+				clients.KubeClient,
+				t.Logf,
+				url,
+				v1test.RetryingRouteInconsistency(spoof.MatchesAllOf(spoof.IsStatusOK, spoof.MatchesBody(test.HelloWorldText))),
+				"readinessIsReady",
+				test.ServingFlags.ResolvableDomain,
+				test.AddRootCAtoTransport(context.Background(), t.Logf, clients, test.ServingFlags.HTTPS),
+			); err != nil {
+				return false, err
+			}
+
+			// Keep retrying for readinessPropagationTime to allow
+			// readiness to propagate, otherwise we could get a false
+			// positive since ingresses and activator may not have noticed
+			// the readiness change yet.
+			return false, nil
+		}); !errors.Is(err, wait.ErrWaitTimeout) {
+			t.Fatalf("Expected endpoint for Route %s at %s to consistently return success (even though readinessProbe is failing), but got: %v", names.Route, url, err)
+		}
+	})
+}
 
-	test.EnsureTearDown(t, clients, &names)
+// waitReadyThenStartFailing creates a service, waits for it to pass readiness,
+// and then causes its readiness test to start failing. It returns the URL of
+// an endpoint on the created service, and an appropriate spoofing client to
+// use to access it.
+func waitReadyThenStartFailing(t *testing.T, clients *test.Clients, names test.ResourceNames, probePeriod int32) (*url.URL, *spoof.SpoofingClient) {
 	resources, err := v1test.CreateServiceReady(t, clients, &names, v1opts.WithReadinessProbe(
 		&corev1.Probe{
-			// This behaviour is only the case where periodSeconds=0, because probes
-			// are ignored after startup when periodSeconds>0
-			// See https://github.com/knative/serving/issues/10764.
-			PeriodSeconds: 0,
+			PeriodSeconds: probePeriod,
 			Handler: corev1.Handler{
 				HTTPGet: &corev1.HTTPGetAction{
 					Path: "/healthz",
@@ -218,41 +299,7 @@ func TestProbeRuntimeAfterStartup(t *testing.T) {
 	}
 
 	url.Path = "/"
-	// When periodSeconds = 0 we expect that once readiness propagates
-	// there will be no ready pods, and therefore the request will time
-	// out. (see https://github.com/knative/serving/issues/10765).
-	if err := wait.PollImmediate(1*time.Second, readinessPropagationTime, func() (bool, error) {
-		startFailing, err := http.NewRequest(http.MethodGet, url.String(), nil)
-		if err != nil {
-			return false, err
-		}
-
-		// 5 Seconds is enough to be confident the request is timing out.
-		client.Client.Timeout = 5 * time.Second
-
-		resp, err := client.Do(startFailing, func(err error) (bool, error) {
-			if isTimeout(err) {
-				// We're actually expecting a timeout here, so don't retry on timeouts.
-				return false, nil
-			}
-
-			return spoof.DefaultErrorRetryChecker(err)
-		})
-		if isTimeout(err) {
-			// We expect to eventually time out, so this is the success case.
-			return true, nil
-		} else if err != nil {
-			// Other errors are not expected.
-			return false, err
-		} else if resp.StatusCode == http.StatusOK {
-			// We'll continue to get 200s for a while until readiness propagates.
-			return false, nil
-		}
-
-		return false, errors.New("Received non-200 status code (expected to eventually time out)")
-	}); err != nil {
-		t.Fatal("Expected to eventually see request timeout due to all pods becoming unready, but got:", err)
-	}
+	return url, client
 }
 
 func isTimeout(err error) bool {