From d4b33578dbee164e536dd58c5a55855b91220742 Mon Sep 17 00:00:00 2001
From: Clayton Coleman <ccoleman@redhat.com>
Date: Mon, 15 Jun 2020 14:15:46 -0400
Subject: [PATCH] Measure container restarts over an interval rather than
 as an absolute count

We run these tests after disruptive events where restarts are
necessary. Instead of looking at the absolute restart count, record
each container's restart count the first time it is seen and compare
later polls against that baseline, so that only restarts occurring
during the test window are counted. We double the observation window
(two minutes to four) and lower the failure threshold from more than
5 total restarts to 2 or more restarts within the window, in order to
ensure we capture slow restart failures (full backoff is 5m, but we
can still catch restarts that are slow but not yet limited by
backoff).
---
 test/extended/operators/cluster.go            | 27 ++++++++++++++-----
 .../generated/zz_generated.annotations.go     |  2 +-
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/test/extended/operators/cluster.go b/test/extended/operators/cluster.go
index 37a560d3537b..5373333771e6 100644
--- a/test/extended/operators/cluster.go
+++ b/test/extended/operators/cluster.go
@@ -19,13 +19,14 @@ import (
 var _ = g.Describe("[sig-arch] Managed cluster should", func() {
 	defer g.GinkgoRecover()
 
-	g.It("have no crashlooping pods in core namespaces over two minutes", func() {
+	g.It("have no crashlooping pods in core namespaces over four minutes", func() {
 		c, err := e2e.LoadClientset()
 		o.Expect(err).NotTo(o.HaveOccurred())
 
+		restartingContainers := make(map[containerName]int)
 		podsWithProblems := make(map[string]*corev1.Pod)
 		var lastPending map[string]*corev1.Pod
-		wait.PollImmediate(5*time.Second, 2*time.Minute, func() (bool, error) {
+		wait.PollImmediate(5*time.Second, 4*time.Minute, func() (bool, error) {
 			allPods, err := c.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{})
 			o.Expect(err).NotTo(o.HaveOccurred())
 
@@ -64,7 +65,7 @@ var _ = g.Describe("[sig-arch] Managed cluster should", func() {
 				case hasCreateContainerError(pod):
 				case hasImagePullError(pod):
 				case isCrashLooping(pod):
-				case hasExcessiveRestarts(pod):
+				case hasExcessiveRestarts(pod, 2, restartingContainers):
 				case hasFailingContainer(pod):
 				default:
 					continue
@@ -173,10 +174,24 @@ func isCrashLooping(pod *corev1.Pod) bool {
 	return false
 }
 
-func hasExcessiveRestarts(pod *corev1.Pod) bool {
+type containerName struct {
+	namespace string
+	name      string
+	container string
+}
+
+func hasExcessiveRestarts(pod *corev1.Pod, excessiveCount int, counts map[containerName]int) bool {
 	for _, status := range append(append([]corev1.ContainerStatus{}, pod.Status.InitContainerStatuses...), pod.Status.ContainerStatuses...) {
-		if status.RestartCount > 5 {
-			pod.Status.Message = fmt.Sprintf("container %s has restarted more than 5 times", status.Name)
+		name := containerName{namespace: pod.Namespace, name: pod.Name, container: status.Name}
+		count, ok := counts[name]
+		if !ok {
+			counts[name] = int(status.RestartCount)
+			continue
+		}
+
+		current := int(status.RestartCount) - count
+		if current >= excessiveCount {
+			pod.Status.Message = fmt.Sprintf("container %s has restarted %d times (>= %d) within the allowed interval", status.Name, current, excessiveCount)
 			return true
 		}
 	}
diff --git a/test/extended/util/annotate/generated/zz_generated.annotations.go b/test/extended/util/annotate/generated/zz_generated.annotations.go
index 12ac908ebd92..0e5c1d417415 100644
--- a/test/extended/util/annotate/generated/zz_generated.annotations.go
+++ b/test/extended/util/annotate/generated/zz_generated.annotations.go
@@ -376,7 +376,7 @@ var annotations = map[string]string{
 	"[Top Level] [sig-arch] Managed cluster should ensure control plane operators do not make themselves unevictable":                                                                                                                         "ensure control plane operators do not make themselves unevictable [Suite:openshift/conformance/parallel]",
 	"[Top Level] [sig-arch] Managed cluster should ensure control plane pods do not run in best-effort QoS":                                                                                                                                   "ensure control plane pods do not run in best-effort QoS [Suite:openshift/conformance/parallel]",
 	"[Top Level] [sig-arch] Managed cluster should ensure pods use downstream images from our release image with proper ImagePullPolicy":                                                                                                      "should ensure pods use downstream images from our release image with proper ImagePullPolicy [Suite:openshift/conformance/parallel]",
-	"[Top Level] [sig-arch] Managed cluster should have no crashlooping pods in core namespaces over two minutes":                                                                                                                             "have no crashlooping pods in core namespaces over two minutes [Suite:openshift/conformance/parallel]",
+	"[Top Level] [sig-arch] Managed cluster should have no crashlooping pods in core namespaces over four minutes":                                                                                                                            "have no crashlooping pods in core namespaces over four minutes [Suite:openshift/conformance/parallel]",
 	"[Top Level] [sig-arch] Managed cluster should have operators on the cluster version":                                                                                                                                                     "have operators on the cluster version [Suite:openshift/conformance/parallel]",
 	"[Top Level] [sig-arch] Managed cluster should recover when operator-owned objects are deleted [Disruptive]":                                                                                                                              "when operator-owned objects are deleted [Disruptive] [Serial] [Suite:openshift]",
 	"[Top Level] [sig-arch] Managed cluster should should expose cluster services outside the cluster":                                                                                                                                        "should expose cluster services outside the cluster [Suite:openshift/conformance/parallel]",