From d4cf7fe948af70911e14c70f8e0bf8d1b49a9440 Mon Sep 17 00:00:00 2001 From: dprotaso Date: Fri, 16 Feb 2024 11:12:28 -0500 Subject: [PATCH 1/4] include a test that reproduces issue #13677 --- test/conformance.go | 1 + test/e2e/defective_revision_test.go | 177 ++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+) create mode 100644 test/e2e/defective_revision_test.go diff --git a/test/conformance.go b/test/conformance.go index ff31fd4d49f1..5dfb2d19097d 100644 --- a/test/conformance.go +++ b/test/conformance.go @@ -46,6 +46,7 @@ const ( PizzaPlanet1 = "pizzaplanetv1" PizzaPlanet2 = "pizzaplanetv2" Readiness = "readiness" + RevisionFailure = "revisionfailure" Runtime = "runtime" ServingContainer = "servingcontainer" SidecarContainer = "sidecarcontainer" diff --git a/test/e2e/defective_revision_test.go b/test/e2e/defective_revision_test.go new file mode 100644 index 000000000000..d9931d6368ec --- /dev/null +++ b/test/e2e/defective_revision_test.go @@ -0,0 +1,177 @@ +//go:build e2e +// +build e2e + +/* +Copyright 2019 The Knative Authors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "context" + "errors" + "net/http" + "testing" + "time" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "knative.dev/pkg/ptr" + pkgtest "knative.dev/pkg/test" + v1 "knative.dev/serving/pkg/apis/serving/v1" + "knative.dev/serving/test" + v1test "knative.dev/serving/test/v1" + + revisionnames "knative.dev/serving/pkg/reconciler/revision/resources/names" +) + +func TestDefectiveRevisionScalesDown(t *testing.T) { + t.Parallel() + clients := Setup(t) // This one uses the default namespace `test.ServingFlags.TestNamespace` + + resources := test.ResourceNames{ + Service: test.ObjectNameForTest(t), + Image: test.RevisionFailure, + } + test.EnsureTearDown(t, clients, &resources) + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: resources.Service, + }, + } + ctx := context.Background() + + serviceFunc := func(s *v1.Service) { + name := time.Now().String() + + s.Spec.Template.Spec.TimeoutSeconds = ptr.Int64(30) + s.Spec.Template.Spec.Containers[0].Env = []corev1.EnvVar{{ + Name: "NAME", + Value: name, + }} + s.Spec.Template.Spec.Containers[0].VolumeMounts = []corev1.VolumeMount{{ + Name: "failure-list", + MountPath: "/etc/config", + }} + s.Spec.Template.Spec.Volumes = []corev1.Volume{{ + Name: "failure-list", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: resources.Service, + }, + }, + }, + }} + } + + cmClient := clients.KubeClient.CoreV1().ConfigMaps(test.ServingFlags.TestNamespace) + if _, err := cmClient.Create(ctx, cm, metav1.CreateOptions{}); err != nil { + t.Fatal("failed to create configmap", err) + } + + t.Cleanup(func() { + cmClient.Delete(ctx, resources.Service, metav1.DeleteOptions{}) + }) + + objs, err := v1test.CreateServiceReady(t, clients, &resources, serviceFunc) + if err != nil { + t.Fatalf("Failed to create 
Service %q in namespace %q: %v", resources.Service, test.ServingFlags.TestNamespace, err) + } + + rev1DeploymentName := revisionnames.Deployment(objs.Revision) + if err = WaitForScaleToZero(t, rev1DeploymentName, clients); err != nil { + t.Fatalf("Failed to scale Service %q in namespace %q to zero: %v", resources.Service, test.ServingFlags.TestNamespace, err) + } + + // Update the ConfigMap to trigger the first revision to fail + cm.Data = map[string]string{resources.Revision: "fail"} + + if _, err := cmClient.Update(ctx, cm, metav1.UpdateOptions{}); err != nil { + t.Fatal("failed to trigger revision to fail", err) + } + + url := objs.Service.Status.URL + client, err := pkgtest.NewSpoofingClient(ctx, + clients.KubeClient, + t.Logf, + url.Host, + test.ServingFlags.ResolvableDomain, + test.AddRootCAtoTransport(context.Background(), t.Logf, clients, test.ServingFlags.HTTPS), + ) + if err != nil { + t.Fatal("failed to make spoof client") + } + + // trigger scale from zero + errs := make(chan error, 1) + go func() { + t.Log("making a request") + + now := time.Now() + req, _ := http.NewRequest("GET", url.String(), nil) + resp, err := client.Do(req) + + t.Log("making a request is done - took", time.Since(now)) + if err != nil { + errs <- err + } else if resp.StatusCode == http.StatusOK { + errs <- errors.New("expect scale up request to fail") + } + close(errs) + }() + + var requestErr error + err = pkgtest.WaitForDeploymentState( + context.Background(), + clients.KubeClient, + rev1DeploymentName, + func(d *appsv1.Deployment) (bool, error) { + // Fail if there's an error making the request + select { + case requestErr = <-errs: + return false, requestErr + default: + } + return d.Status.Replicas == 1 && d.Status.UnavailableReplicas == 1, nil + }, + "DeploymentIsScalingUp", + test.ServingFlags.TestNamespace, + 2*time.Minute, + ) + + if requestErr != nil { + t.Fatal("triggering request to revision failed: ", err) + } + + if err != nil { + t.Fatal("failed to wait for deployment to scale up: ", err) + } + + if _, err = v1test.UpdateService(t, clients, resources, serviceFunc); err != nil { + t.Fatal("failed to update service: ", err) + } + + if _, err = v1test.WaitForServiceLatestRevision(clients, resources); err != nil { + t.Fatal("failed to rollout new revision", err) + } + + if err = WaitForScaleToZero(t, rev1DeploymentName, clients); err != nil { + t.Fatal("first revision failed to scale down") + } +} From 3d1bd730a7332d75e5a438e8ac0adc99aa803604 Mon Sep 17 00:00:00 2001 From: dprotaso Date: Fri, 16 Feb 2024 14:56:37 -0500 Subject: [PATCH 2/4] debug why test didn't fail --- test/e2e-tests.sh | 150 +++++++++++++++++++++++----------------------- 1 file changed, 75 insertions(+), 75 deletions(-) diff --git a/test/e2e-tests.sh b/test/e2e-tests.sh index b134796b4239..815cd9b0ef65 100755 --- a/test/e2e-tests.sh +++ b/test/e2e-tests.sh @@ -36,7 +36,7 @@ header "Running tests" failed=0 # Run tests serially in the mesh and https scenarios. 
-GO_TEST_FLAGS="" +GO_TEST_FLAGS="-v -run TestDefect" E2E_TEST_FLAGS="${TEST_OPTIONS}" if [ -z "${E2E_TEST_FLAGS}" ]; then @@ -71,80 +71,80 @@ go_test_e2e -timeout=30m \ ./test/e2e \ ${E2E_TEST_FLAGS} || failed=1 -toggle_feature tag-header-based-routing Enabled -go_test_e2e -timeout=2m ./test/e2e/tagheader ${E2E_TEST_FLAGS} || failed=1 -toggle_feature tag-header-based-routing Disabled - -toggle_feature allow-zero-initial-scale true config-autoscaler || fail_test -go_test_e2e -timeout=2m ./test/e2e/initscale ${E2E_TEST_FLAGS} || failed=1 -toggle_feature allow-zero-initial-scale false config-autoscaler || fail_test - -go_test_e2e -timeout=2m ./test/e2e/domainmapping ${E2E_TEST_FLAGS} || failed=1 - -toggle_feature cluster-local-domain-tls enabled config-network || fail_test -go_test_e2e -timeout=2m ./test/e2e/clusterlocaldomaintls ${E2E_TEST_FLAGS} || failed=1 -toggle_feature cluster-local-domain-tls disabled config-network || fail_test - -toggle_feature system-internal-tls enabled config-network || fail_test -toggle_feature "logging.enable-request-log" true config-observability || fail_test -toggle_feature "logging.request-log-template" "TLS: {{.Request.TLS}}" config-observability || fail_test -# with current implementation, Activator must be restarted when configuring system-internal-tls. See https://github.com/knative/serving/issues/13754 -restart_pod ${SYSTEM_NAMESPACE} "app=activator" -go_test_e2e -timeout=2m ./test/e2e/systeminternaltls ${E2E_TEST_FLAGS} || failed=1 -toggle_feature system-internal-tls disabled config-network || fail_test -toggle_feature enable-request-log false config-observability || fail_test -toggle_feature request-log-template '' config-observability || fail_test -# with the current implementation, Activator is always in the request path, and needs to be restarted after configuring system-internal-tls -restart_pod ${SYSTEM_NAMESPACE} "app=activator" - -kubectl get cm "config-gc" -n "${SYSTEM_NAMESPACE}" -o yaml > "${TMP_DIR}"/config-gc.yaml -add_trap "kubectl replace cm 'config-gc' -n ${SYSTEM_NAMESPACE} -f ${TMP_DIR}/config-gc.yaml" SIGKILL SIGTERM SIGQUIT -immediate_gc -go_test_e2e -timeout=2m ./test/e2e/gc ${E2E_TEST_FLAGS} || failed=1 -kubectl replace cm "config-gc" -n ${SYSTEM_NAMESPACE} -f "${TMP_DIR}"/config-gc.yaml - -# Run scale tests. -# Note that we use a very high -parallel because each ksvc is run as its own -# sub-test. If this is not larger than the maximum scale tested then the test -# simply cannot pass. -# TODO - Renable once we get this reliably passing on GKE 1.21 -# go_test_e2e -timeout=20m -parallel=300 ./test/scale ${E2E_TEST_FLAGS} || failed=1 - -# Run HPA tests -go_test_e2e -timeout=30m -tags=hpa ./test/e2e ${E2E_TEST_FLAGS} || failed=1 - -# Run initContainers tests with alpha enabled avoiding any issues with the testing options guard above -# InitContainers test uses emptyDir. -toggle_feature kubernetes.podspec-init-containers Enabled -go_test_e2e -timeout=2m ./test/e2e/initcontainers ${E2E_TEST_FLAGS} || failed=1 -toggle_feature kubernetes.podspec-init-containers Disabled - -# RUN PVC tests with default storage class. 
-toggle_feature kubernetes.podspec-persistent-volume-claim Enabled -toggle_feature kubernetes.podspec-persistent-volume-write Enabled -toggle_feature kubernetes.podspec-securitycontext Enabled -go_test_e2e -timeout=5m ./test/e2e/pvc ${E2E_TEST_FLAGS} || failed=1 -toggle_feature kubernetes.podspec-securitycontext Disabled -toggle_feature kubernetes.podspec-persistent-volume-write Disabled -toggle_feature kubernetes.podspec-persistent-volume-claim Disabled - -# RUN secure pod defaults test in a separate install. -toggle_feature secure-pod-defaults Enabled -go_test_e2e -timeout=3m ./test/e2e/securedefaults ${E2E_TEST_FLAGS} || failed=1 -toggle_feature secure-pod-defaults Disabled - -# Run HA tests separately as they're stopping core Knative Serving pods. -# Define short -spoofinterval to ensure frequent probing while stopping pods. -go_test_e2e -timeout=25m -failfast -parallel=1 ./test/ha \ - ${E2E_TEST_FLAGS} \ - -replicas="${REPLICAS:-1}" \ - -buckets="${BUCKETS:-1}" \ - -spoofinterval="10ms" || failed=1 - -if (( HTTPS )); then - kubectl delete -f ${E2E_YAML_DIR}/test/config/externaldomaintls/certmanager/caissuer/ --ignore-not-found - toggle_feature external-domain-tls Disabled config-network -fi +# toggle_feature tag-header-based-routing Enabled +# go_test_e2e -timeout=2m ./test/e2e/tagheader ${E2E_TEST_FLAGS} || failed=1 +# toggle_feature tag-header-based-routing Disabled + +# toggle_feature allow-zero-initial-scale true config-autoscaler || fail_test +# go_test_e2e -timeout=2m ./test/e2e/initscale ${E2E_TEST_FLAGS} || failed=1 +# toggle_feature allow-zero-initial-scale false config-autoscaler || fail_test + +# go_test_e2e -timeout=2m ./test/e2e/domainmapping ${E2E_TEST_FLAGS} || failed=1 + +# toggle_feature cluster-local-domain-tls enabled config-network || fail_test +# go_test_e2e -timeout=2m ./test/e2e/clusterlocaldomaintls ${E2E_TEST_FLAGS} || failed=1 +# toggle_feature cluster-local-domain-tls disabled config-network || fail_test + +# toggle_feature system-internal-tls enabled config-network || fail_test +# toggle_feature "logging.enable-request-log" true config-observability || fail_test +# toggle_feature "logging.request-log-template" "TLS: {{.Request.TLS}}" config-observability || fail_test +# # with current implementation, Activator must be restarted when configuring system-internal-tls. See https://github.com/knative/serving/issues/13754 +# restart_pod ${SYSTEM_NAMESPACE} "app=activator" +# go_test_e2e -timeout=2m ./test/e2e/systeminternaltls ${E2E_TEST_FLAGS} || failed=1 +# toggle_feature system-internal-tls disabled config-network || fail_test +# toggle_feature enable-request-log false config-observability || fail_test +# toggle_feature request-log-template '' config-observability || fail_test +# # with the current implementation, Activator is always in the request path, and needs to be restarted after configuring system-internal-tls +# restart_pod ${SYSTEM_NAMESPACE} "app=activator" + +# kubectl get cm "config-gc" -n "${SYSTEM_NAMESPACE}" -o yaml > "${TMP_DIR}"/config-gc.yaml +# add_trap "kubectl replace cm 'config-gc' -n ${SYSTEM_NAMESPACE} -f ${TMP_DIR}/config-gc.yaml" SIGKILL SIGTERM SIGQUIT +# immediate_gc +# go_test_e2e -timeout=2m ./test/e2e/gc ${E2E_TEST_FLAGS} || failed=1 +# kubectl replace cm "config-gc" -n ${SYSTEM_NAMESPACE} -f "${TMP_DIR}"/config-gc.yaml + +# # Run scale tests. +# # Note that we use a very high -parallel because each ksvc is run as its own +# # sub-test. If this is not larger than the maximum scale tested then the test +# # simply cannot pass. 
+# # TODO - Renable once we get this reliably passing on GKE 1.21 +# # go_test_e2e -timeout=20m -parallel=300 ./test/scale ${E2E_TEST_FLAGS} || failed=1 + +# # Run HPA tests +# go_test_e2e -timeout=30m -tags=hpa ./test/e2e ${E2E_TEST_FLAGS} || failed=1 + +# # Run initContainers tests with alpha enabled avoiding any issues with the testing options guard above +# # InitContainers test uses emptyDir. +# toggle_feature kubernetes.podspec-init-containers Enabled +# go_test_e2e -timeout=2m ./test/e2e/initcontainers ${E2E_TEST_FLAGS} || failed=1 +# toggle_feature kubernetes.podspec-init-containers Disabled + +# # RUN PVC tests with default storage class. +# toggle_feature kubernetes.podspec-persistent-volume-claim Enabled +# toggle_feature kubernetes.podspec-persistent-volume-write Enabled +# toggle_feature kubernetes.podspec-securitycontext Enabled +# go_test_e2e -timeout=5m ./test/e2e/pvc ${E2E_TEST_FLAGS} || failed=1 +# toggle_feature kubernetes.podspec-securitycontext Disabled +# toggle_feature kubernetes.podspec-persistent-volume-write Disabled +# toggle_feature kubernetes.podspec-persistent-volume-claim Disabled + +# # RUN secure pod defaults test in a separate install. +# toggle_feature secure-pod-defaults Enabled +# go_test_e2e -timeout=3m ./test/e2e/securedefaults ${E2E_TEST_FLAGS} || failed=1 +# toggle_feature secure-pod-defaults Disabled + +# # Run HA tests separately as they're stopping core Knative Serving pods. +# # Define short -spoofinterval to ensure frequent probing while stopping pods. +# go_test_e2e -timeout=25m -failfast -parallel=1 ./test/ha \ +# ${E2E_TEST_FLAGS} \ +# -replicas="${REPLICAS:-1}" \ +# -buckets="${BUCKETS:-1}" \ +# -spoofinterval="10ms" || failed=1 + +# if (( HTTPS )); then +# kubectl delete -f ${E2E_YAML_DIR}/test/config/externaldomaintls/certmanager/caissuer/ --ignore-not-found +# toggle_feature external-domain-tls Disabled config-network +# fi (( failed )) && fail_test From 1fb98cd2805e08bfc5927e3a57556c6cf8252d4c Mon Sep 17 00:00:00 2001 From: dprotaso Date: Fri, 16 Feb 2024 16:46:21 -0500 Subject: [PATCH 3/4] figure out why test isn't failing here --- test/e2e/defective_revision_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/e2e/defective_revision_test.go b/test/e2e/defective_revision_test.go index d9931d6368ec..c8e4e8757a1f 100644 --- a/test/e2e/defective_revision_test.go +++ b/test/e2e/defective_revision_test.go @@ -174,4 +174,6 @@ func TestDefectiveRevisionScalesDown(t *testing.T) { if err = WaitForScaleToZero(t, rev1DeploymentName, clients); err != nil { t.Fatal("first revision failed to scale down") } + + t.Fatal("it shouldn't fail here") } From dd77d912894c949bf58c8619efdde6cfbe30d248 Mon Sep 17 00:00:00 2001 From: Dave Protasowski Date: Sat, 17 Feb 2024 10:12:56 -0500 Subject: [PATCH 4/4] debug --- test/e2e-tests.sh | 5 ++--- test/e2e/defective_revision_test.go | 31 ++++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/test/e2e-tests.sh b/test/e2e-tests.sh index 815cd9b0ef65..85cd4eb1d748 100755 --- a/test/e2e-tests.sh +++ b/test/e2e-tests.sh @@ -36,7 +36,7 @@ header "Running tests" failed=0 # Run tests serially in the mesh and https scenarios. -GO_TEST_FLAGS="-v -run TestDefect" +GO_TEST_FLAGS="-failfast -run TestDefect" E2E_TEST_FLAGS="${TEST_OPTIONS}" if [ -z "${E2E_TEST_FLAGS}" ]; then @@ -63,8 +63,7 @@ if (( SHORT )); then GO_TEST_FLAGS+=" -short" fi - -go_test_e2e -timeout=30m \ +go_test_e2e -timeout=60m \ ${GO_TEST_FLAGS} \ ./test/conformance/api/... \ ./test/conformance/runtime/... 
\ diff --git a/test/e2e/defective_revision_test.go b/test/e2e/defective_revision_test.go index c8e4e8757a1f..7bd9cc2ff2f1 100644 --- a/test/e2e/defective_revision_test.go +++ b/test/e2e/defective_revision_test.go @@ -22,6 +22,7 @@ package e2e import ( "context" "errors" + "fmt" "net/http" "testing" "time" @@ -39,8 +40,16 @@ import ( revisionnames "knative.dev/serving/pkg/reconciler/revision/resources/names" ) -func TestDefectiveRevisionScalesDown(t *testing.T) { +func TestDefectiveRevision(t *testing.T) { t.Parallel() + for i := 0; i < 10; i++ { + t.Run(fmt.Sprintf("%v", i), func(t *testing.T) { + runtest(t) + }) + } +} + +func runtest(t *testing.T) { clients := Setup(t) // This one uses the default namespace `test.ServingFlags.TestNamespace` resources := test.ResourceNames{ @@ -142,7 +151,7 @@ func TestDefectiveRevisionScalesDown(t *testing.T) { clients.KubeClient, rev1DeploymentName, func(d *appsv1.Deployment) (bool, error) { - // Fail if there's an error making the request + // Fail fast if there's an error making the request select { case requestErr = <-errs: return false, requestErr @@ -156,13 +165,14 @@ func TestDefectiveRevisionScalesDown(t *testing.T) { ) if requestErr != nil { - t.Fatal("triggering request to revision failed: ", err) + t.Fatal("triggering request to revision failed: ", requestErr) } if err != nil { t.Fatal("failed to wait for deployment to scale up: ", err) } + // trigger a rollout of a new revision if _, err = v1test.UpdateService(t, clients, resources, serviceFunc); err != nil { t.Fatal("failed to update service: ", err) } @@ -171,9 +181,20 @@ func TestDefectiveRevisionScalesDown(t *testing.T) { t.Fatal("failed to rollout new revision", err) } + // wait for the request to be complete prior to asserting scale to zero +outer: + for { + select { + case _, ok := <-errs: + if !ok { + break outer + } + case <-time.After(30 * time.Second): + t.Fatalf("request didn't complete") + } + } + if err = WaitForScaleToZero(t, rev1DeploymentName, clients); err != nil { t.Fatal("first revision failed to scale down") } - - t.Fatal("it shouldn't fail here") }
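
Note on the test image: patch 1 registers the new "revisionfailure" image name
(test.RevisionFailure) in test/conformance.go, but the image source is not part
of this series. From the way the test wires it up (a ConfigMap mounted at
/etc/config, the first revision's name later written into it as a key with the
value "fail", and the deployment then expected to report an unavailable
replica), the image's contract appears to be: refuse to become ready once your
own revision is listed in the mounted ConfigMap. The NAME environment variable,
set to time.Now().String(), appears to exist only to make each applied template
unique so that UpdateService rolls out a fresh Revision. The sketch below is
one plausible shape for such an image, not the actual implementation; it
assumes the K_REVISION environment variable that Knative Serving injects into
user containers is how the process learns its own revision name, and it uses a
crash loop to keep the pod unready.

package main

import (
	"fmt"
	"log"
	"net/http"
	"os"
	"path/filepath"
)

// markedAsFailing reports whether this revision has been flagged by the test.
// The test sets cm.Data = {<revision-name>: "fail"}, and a ConfigMap volume
// surfaces each key as a file, so the flag shows up as /etc/config/<revision>.
func markedAsFailing() bool {
	rev := os.Getenv("K_REVISION") // injected by Knative Serving
	if rev == "" {
		return false
	}
	_, err := os.Stat(filepath.Join("/etc/config", rev))
	return err == nil
}

func main() {
	if markedAsFailing() {
		// Exit non-zero so the pod crash-loops: the revision never becomes
		// ready, the deployment reports UnavailableReplicas == 1 (which is
		// what WaitForDeploymentState polls for), and the scale-from-zero
		// request comes back non-200 instead of http.StatusOK.
		log.Fatal("revision is marked as failing; refusing to start")
	}

	port := os.Getenv("PORT")
	if port == "" {
		port = "8080"
	}
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintln(w, "serving")
	})
	log.Fatal(http.ListenAndServe(":"+port, nil))
}

A startup-time check is enough for this flow because the first revision has
already scaled to zero before the ConfigMap is flipped, so the next pod sees
the file as soon as it boots; a request-path check would additionally have to
wait out the kubelet's ConfigMap volume propagation delay.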