From 0bbe9fc9e644cbd6c09ed4af250d5504e5084359 Mon Sep 17 00:00:00 2001
From: Clayton Coleman <ccoleman@redhat.com>
Date: Mon, 15 Jun 2020 14:15:46 -0400
Subject: [PATCH] Measure container restarts over an interval instead of
 as an absolute count

We run these tests after disruptive events where restarts are
necessary. Instead of looking at the absolute restart count, record
each container's restart count the first time it is observed and
compare later observations against that baseline, so that only
restarts occurring during the test interval count against the pod.
We double the time interval and halve the number of failing restarts
in order to ensure we capture slow restart failures (full backoff is
5m, so we can catch containers that are restarting slowly but have
not yet reached the backoff limit).
---
 test/extended/operators/cluster.go            | 25 +++++++++++++++----
 .../generated/zz_generated.annotations.go     |  2 +-
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/test/extended/operators/cluster.go b/test/extended/operators/cluster.go
index 4351f9fd2142..764b8d7da649 100644
--- a/test/extended/operators/cluster.go
+++ b/test/extended/operators/cluster.go
@@ -18,10 +18,11 @@ import (
 var _ = g.Describe("[Feature:Platform] Managed cluster should", func() {
 	defer g.GinkgoRecover()
 
-	g.It("have no crashlooping pods in core namespaces over two minutes", func() {
+	g.It("have no crashlooping pods in core namespaces over four minutes", func() {
 		c, err := e2e.LoadClientset()
 		o.Expect(err).NotTo(o.HaveOccurred())
 
+		restartingContainers := make(map[containerName]int)
 		podsWithProblems := make(map[string]*corev1.Pod)
 		var lastPending map[string]*corev1.Pod
 		wait.PollImmediate(5*time.Second, 2*time.Minute, func() (bool, error) {
@@ -63,7 +64,7 @@ var _ = g.Describe("[Feature:Platform] Managed cluster should", func() {
 				case hasCreateContainerError(pod):
 				case hasImagePullError(pod):
 				case isCrashLooping(pod):
-				case hasExcessiveRestarts(pod):
+				case hasExcessiveRestarts(pod, 2, restartingContainers):
 				case hasFailingContainer(pod):
 				default:
 					continue
@@ -172,10 +173,24 @@ func isCrashLooping(pod *corev1.Pod) bool {
 	return false
 }
 
-func hasExcessiveRestarts(pod *corev1.Pod) bool {
+type containerName struct {
+	namespace string
+	name      string
+	container string
+}
+
+func hasExcessiveRestarts(pod *corev1.Pod, excessiveCount int, counts map[containerName]int) bool {
 	for _, status := range append(append([]corev1.ContainerStatus{}, pod.Status.InitContainerStatuses...), pod.Status.ContainerStatuses...) {
-		if status.RestartCount > 5 {
-			pod.Status.Message = fmt.Sprintf("container %s has restarted more than 5 times", status.Name)
+		name := containerName{namespace: pod.Namespace, name: pod.Name, container: status.Name}
+		count, ok := counts[name]
+		if !ok {
+			counts[name] = int(status.RestartCount)
+			continue
+		}
+
+		current := int(status.RestartCount) - count
+		if current >= excessiveCount {
+			pod.Status.Message = fmt.Sprintf("container %s has restarted %d times (>= %d) within the allowed interval", status.Name, current, excessiveCount)
 			return true
 		}
 	}
diff --git a/test/extended/util/annotate/generated/zz_generated.annotations.go b/test/extended/util/annotate/generated/zz_generated.annotations.go
index 4192bef6edfc..eb2df04bc5e8 100644
--- a/test/extended/util/annotate/generated/zz_generated.annotations.go
+++ b/test/extended/util/annotate/generated/zz_generated.annotations.go
@@ -330,7 +330,7 @@ var annotations = map[string]string{
 	"[Top Level] [Feature:Platform] Managed cluster should ensure control plane operators do not make themselves unevictable":                                                                                                                             "[Top Level] [Feature:Platform] Managed cluster should ensure control plane operators do not make themselves unevictable [Skipped:ibmcloud] [Suite:openshift/conformance/parallel]",
 	"[Top Level] [Feature:Platform] Managed cluster should ensure control plane pods do not run in best-effort QoS":                                                                                                                                       "[Top Level] [Feature:Platform] Managed cluster should ensure control plane pods do not run in best-effort QoS [Skipped:ibmcloud] [Suite:openshift/conformance/parallel]",
 	"[Top Level] [Feature:Platform] Managed cluster should ensure pods use downstream images from our release image with proper ImagePullPolicy":                                                                                                          "[Top Level] [Feature:Platform] Managed cluster should ensure pods use downstream images from our release image with proper ImagePullPolicy [Suite:openshift/conformance/parallel]",
-	"[Top Level] [Feature:Platform] Managed cluster should have no crashlooping pods in core namespaces over two minutes":                                                                                                                                 "[Top Level] [Feature:Platform] Managed cluster should have no crashlooping pods in core namespaces over two minutes [Suite:openshift/conformance/parallel]",
+	"[Top Level] [Feature:Platform] Managed cluster should have no crashlooping pods in core namespaces over four minutes":                                                                                                                                "[Top Level] [Feature:Platform] Managed cluster should have no crashlooping pods in core namespaces over four minutes [Suite:openshift/conformance/parallel]",
 	"[Top Level] [Feature:Platform] Managed cluster should have operators on the cluster version":                                                                                                                                                         "[Top Level] [Feature:Platform] Managed cluster should have operators on the cluster version [Suite:openshift/conformance/parallel]",
 	"[Top Level] [Feature:Platform] Managed cluster should recover when operator-owned objects are deleted [Disruptive]":                                                                                                                                  "[Top Level] [Feature:Platform] Managed cluster should recover when operator-owned objects are deleted [Disruptive] [Serial] [Suite:openshift]",
 	"[Top Level] [Feature:Platform] Managed cluster should should expose cluster services outside the cluster":                                                                                                                                            "[Top Level] [Feature:Platform] Managed cluster should should expose cluster services outside the cluster [Suite:openshift/conformance/parallel]",