From 0bbe9fc9e644cbd6c09ed4af250d5504e5084359 Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Mon, 15 Jun 2020 14:15:46 -0400 Subject: [PATCH] Measure container restarts over an interval instead of absolute We run these tests after disruptive events where restarts are necessary. Instead of looking at the absolute restart count, record each container's restart count the first time it is observed and then compare later observations against that baseline to ensure the number of new restarts stays below the threshold. We double the time interval and halve the number of restarts treated as a failure in order to ensure we capture slow restart failures (full backoff is 5m, but we can catch slow but not limited backoff). --- test/extended/operators/cluster.go | 25 +++++++++++++++---- .../generated/zz_generated.annotations.go | 2 +- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/test/extended/operators/cluster.go b/test/extended/operators/cluster.go index 4351f9fd2142..764b8d7da649 100644 --- a/test/extended/operators/cluster.go +++ b/test/extended/operators/cluster.go @@ -18,10 +18,11 @@ import ( var _ = g.Describe("[Feature:Platform] Managed cluster should", func() { defer g.GinkgoRecover() - g.It("have no crashlooping pods in core namespaces over two minutes", func() { + g.It("have no crashlooping pods in core namespaces over four minutes", func() { c, err := e2e.LoadClientset() o.Expect(err).NotTo(o.HaveOccurred()) + restartingContainers := make(map[containerName]int) podsWithProblems := make(map[string]*corev1.Pod) var lastPending map[string]*corev1.Pod wait.PollImmediate(5*time.Second, 2*time.Minute, func() (bool, error) { @@ -63,7 +64,7 @@ var _ = g.Describe("[Feature:Platform] Managed cluster should", func() { case hasCreateContainerError(pod): case hasImagePullError(pod): case isCrashLooping(pod): - case hasExcessiveRestarts(pod): + case hasExcessiveRestarts(pod, 2, restartingContainers): case hasFailingContainer(pod): default: continue @@ -172,10 +173,24 @@ func isCrashLooping(pod *corev1.Pod) bool { return false } -func
hasExcessiveRestarts(pod *corev1.Pod) bool { +type containerName struct { + namespace string + name string + container string +} + +func hasExcessiveRestarts(pod *corev1.Pod, excessiveCount int, counts map[containerName]int) bool { for _, status := range append(append([]corev1.ContainerStatus{}, pod.Status.InitContainerStatuses...), pod.Status.ContainerStatuses...) { - if status.RestartCount > 5 { - pod.Status.Message = fmt.Sprintf("container %s has restarted more than 5 times", status.Name) + name := containerName{namespace: pod.Namespace, name: pod.Name, container: status.Name} + count, ok := counts[name] + if !ok { + counts[name] = int(status.RestartCount) + continue + } + + current := int(status.RestartCount) - count + if current >= excessiveCount { + pod.Status.Message = fmt.Sprintf("container %s has restarted %d times (>= %d) within the allowed interval", status.Name, current, excessiveCount) return true } } diff --git a/test/extended/util/annotate/generated/zz_generated.annotations.go b/test/extended/util/annotate/generated/zz_generated.annotations.go index 4192bef6edfc..eb2df04bc5e8 100644 --- a/test/extended/util/annotate/generated/zz_generated.annotations.go +++ b/test/extended/util/annotate/generated/zz_generated.annotations.go @@ -330,7 +330,7 @@ var annotations = map[string]string{ "[Top Level] [Feature:Platform] Managed cluster should ensure control plane operators do not make themselves unevictable": "[Top Level] [Feature:Platform] Managed cluster should ensure control plane operators do not make themselves unevictable [Skipped:ibmcloud] [Suite:openshift/conformance/parallel]", "[Top Level] [Feature:Platform] Managed cluster should ensure control plane pods do not run in best-effort QoS": "[Top Level] [Feature:Platform] Managed cluster should ensure control plane pods do not run in best-effort QoS [Skipped:ibmcloud] [Suite:openshift/conformance/parallel]", "[Top Level] [Feature:Platform] Managed cluster should ensure pods use downstream images from 
our release image with proper ImagePullPolicy": "[Top Level] [Feature:Platform] Managed cluster should ensure pods use downstream images from our release image with proper ImagePullPolicy [Suite:openshift/conformance/parallel]", - "[Top Level] [Feature:Platform] Managed cluster should have no crashlooping pods in core namespaces over two minutes": "[Top Level] [Feature:Platform] Managed cluster should have no crashlooping pods in core namespaces over two minutes [Suite:openshift/conformance/parallel]", + "[Top Level] [Feature:Platform] Managed cluster should have no crashlooping pods in core namespaces over four minutes": "[Top Level] [Feature:Platform] Managed cluster should have no crashlooping pods in core namespaces over four minutes [Suite:openshift/conformance/parallel]", "[Top Level] [Feature:Platform] Managed cluster should have operators on the cluster version": "[Top Level] [Feature:Platform] Managed cluster should have operators on the cluster version [Suite:openshift/conformance/parallel]", "[Top Level] [Feature:Platform] Managed cluster should recover when operator-owned objects are deleted [Disruptive]": "[Top Level] [Feature:Platform] Managed cluster should recover when operator-owned objects are deleted [Disruptive] [Serial] [Suite:openshift]", "[Top Level] [Feature:Platform] Managed cluster should should expose cluster services outside the cluster": "[Top Level] [Feature:Platform] Managed cluster should should expose cluster services outside the cluster [Suite:openshift/conformance/parallel]",