From d4b33578dbee164e536dd58c5a55855b91220742 Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Mon, 15 Jun 2020 14:15:46 -0400 Subject: [PATCH] Measure container restarts over an interval instead of absolute We run these tests after disruptive events where restarts are necessary. Instead of looking at the absolute restart count, record each container's restart count the first time it is observed and then compare later polls against that baseline to ensure the number of new restarts stays below the threshold. We double the observation window (from two minutes to four) and lower the failing restart threshold (from more than 5 total restarts to 2 or more new restarts) in order to ensure we capture slow restart failures (full backoff is 5m, but we can catch restarts that are slow but have not yet reached the backoff limit). --- test/extended/operators/cluster.go | 27 ++++++++++++++----- .../generated/zz_generated.annotations.go | 2 +- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/test/extended/operators/cluster.go b/test/extended/operators/cluster.go index 37a560d3537b..5373333771e6 100644 --- a/test/extended/operators/cluster.go +++ b/test/extended/operators/cluster.go @@ -19,13 +19,14 @@ import ( var _ = g.Describe("[sig-arch] Managed cluster should", func() { defer g.GinkgoRecover() - g.It("have no crashlooping pods in core namespaces over two minutes", func() { + g.It("have no crashlooping pods in core namespaces over four minutes", func() { c, err := e2e.LoadClientset() o.Expect(err).NotTo(o.HaveOccurred()) + restartingContainers := make(map[containerName]int) podsWithProblems := make(map[string]*corev1.Pod) var lastPending map[string]*corev1.Pod - wait.PollImmediate(5*time.Second, 2*time.Minute, func() (bool, error) { + wait.PollImmediate(5*time.Second, 4*time.Minute, func() (bool, error) { allPods, err := c.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{}) o.Expect(err).NotTo(o.HaveOccurred()) @@ -64,7 +65,7 @@ var _ = g.Describe("[sig-arch] Managed cluster should", func() { case hasCreateContainerError(pod): case hasImagePullError(pod): case isCrashLooping(pod): - case hasExcessiveRestarts(pod): + case hasExcessiveRestarts(pod, 2, 
restartingContainers): case hasFailingContainer(pod): default: continue @@ -173,10 +174,24 @@ func isCrashLooping(pod *corev1.Pod) bool { return false } -func hasExcessiveRestarts(pod *corev1.Pod) bool { +type containerName struct { + namespace string + name string + container string +} + +func hasExcessiveRestarts(pod *corev1.Pod, excessiveCount int, counts map[containerName]int) bool { for _, status := range append(append([]corev1.ContainerStatus{}, pod.Status.InitContainerStatuses...), pod.Status.ContainerStatuses...) { - if status.RestartCount > 5 { - pod.Status.Message = fmt.Sprintf("container %s has restarted more than 5 times", status.Name) + name := containerName{namespace: pod.Namespace, name: pod.Name, container: status.Name} + count, ok := counts[name] + if !ok { + counts[name] = int(status.RestartCount) + continue + } + + current := int(status.RestartCount) - count + if current >= excessiveCount { + pod.Status.Message = fmt.Sprintf("container %s has restarted %d times (>= %d) within the allowed interval", status.Name, current, excessiveCount) return true } } diff --git a/test/extended/util/annotate/generated/zz_generated.annotations.go b/test/extended/util/annotate/generated/zz_generated.annotations.go index 12ac908ebd92..0e5c1d417415 100644 --- a/test/extended/util/annotate/generated/zz_generated.annotations.go +++ b/test/extended/util/annotate/generated/zz_generated.annotations.go @@ -376,7 +376,7 @@ var annotations = map[string]string{ "[Top Level] [sig-arch] Managed cluster should ensure control plane operators do not make themselves unevictable": "ensure control plane operators do not make themselves unevictable [Suite:openshift/conformance/parallel]", "[Top Level] [sig-arch] Managed cluster should ensure control plane pods do not run in best-effort QoS": "ensure control plane pods do not run in best-effort QoS [Suite:openshift/conformance/parallel]", "[Top Level] [sig-arch] Managed cluster should ensure pods use downstream images from our release 
image with proper ImagePullPolicy": "should ensure pods use downstream images from our release image with proper ImagePullPolicy [Suite:openshift/conformance/parallel]", - "[Top Level] [sig-arch] Managed cluster should have no crashlooping pods in core namespaces over two minutes": "have no crashlooping pods in core namespaces over two minutes [Suite:openshift/conformance/parallel]", + "[Top Level] [sig-arch] Managed cluster should have no crashlooping pods in core namespaces over four minutes": "have no crashlooping pods in core namespaces over four minutes [Suite:openshift/conformance/parallel]", "[Top Level] [sig-arch] Managed cluster should have operators on the cluster version": "have operators on the cluster version [Suite:openshift/conformance/parallel]", "[Top Level] [sig-arch] Managed cluster should recover when operator-owned objects are deleted [Disruptive]": "when operator-owned objects are deleted [Disruptive] [Serial] [Suite:openshift]", "[Top Level] [sig-arch] Managed cluster should should expose cluster services outside the cluster": "should expose cluster services outside the cluster [Suite:openshift/conformance/parallel]",