From c1679b352d94492ee41a1c805544bd2231457842 Mon Sep 17 00:00:00 2001
From: Mike Dame
Date: Mon, 4 May 2020 16:07:50 -0400
Subject: [PATCH 1/2] UPSTREAM: 90740: Balance node usage before creating
 victim pods in preemption e2e

---
 .../test/e2e/scheduling/preemption.go         | 100 ++++++++++--------
 .../test/e2e/scheduling/priorities.go         |  32 ++++--
 2 files changed, 78 insertions(+), 54 deletions(-)

diff --git a/vendor/k8s.io/kubernetes/test/e2e/scheduling/preemption.go b/vendor/k8s.io/kubernetes/test/e2e/scheduling/preemption.go
index 50f3a26f1b38..821b0d21750f 100644
--- a/vendor/k8s.io/kubernetes/test/e2e/scheduling/preemption.go
+++ b/vendor/k8s.io/kubernetes/test/e2e/scheduling/preemption.go
@@ -55,6 +55,8 @@ type priorityPair struct {
 	value int32
 }
 
+var testExtendedResource = v1.ResourceName("scheduling.k8s.io/foo")
+
 var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 	var cs clientset.Interface
 	var nodeList *v1.NodeList
@@ -75,6 +77,10 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 		for _, pair := range priorityPairs {
 			cs.SchedulingV1().PriorityClasses().Delete(context.TODO(), pair.name, *metav1.NewDeleteOptions(0))
 		}
+		for _, node := range nodeList.Items {
+			delete(node.Status.Capacity, testExtendedResource)
+			cs.CoreV1().Nodes().UpdateStatus(context.TODO(), &node, metav1.UpdateOptions{})
+		}
 	})
 
 	ginkgo.BeforeEach(func() {
@@ -103,30 +109,20 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 	// the high priority pod.
 	ginkgo.It("validates basic preemption works", func() {
 		var podRes v1.ResourceList
+
 		// Create one pod per node that uses a lot of the node's resources.
 		ginkgo.By("Create pods that use 60% of node resources.")
 		pods := make([]*v1.Pod, 0, len(nodeList.Items))
-		allPods, err := cs.CoreV1().Pods(metav1.NamespaceAll).List(context.TODO(), metav1.ListOptions{})
-		framework.ExpectNoError(err)
+		// Now create victim pods on each of the node with lower priority
 		for i, node := range nodeList.Items {
-			currentCPUUsage, currentMemUsage := getCurrentPodUsageOnTheNode(node.Name, allPods.Items, podRequestedResource)
-			framework.Logf("Current cpu and memory usage %v, %v", currentCPUUsage, currentMemUsage)
-			cpuAllocatable, found := node.Status.Allocatable["cpu"]
-			framework.ExpectEqual(found, true)
-			milliCPU := cpuAllocatable.MilliValue()
-			milliCPU = int64(float64(milliCPU-currentCPUUsage) * float64(0.6))
-			memAllocatable, found := node.Status.Allocatable["memory"]
-			framework.ExpectEqual(found, true)
-			memory := memAllocatable.Value()
-			memory = int64(float64(memory-currentMemUsage) * float64(0.6))
-			// If a node is already heavily utilized let not's create a pod there.
-			if milliCPU <= 0 || memory <= 0 {
-				framework.Logf("Node is heavily utilized, let's not create a pod here")
-				continue
-			}
+			// Update each node to advertise 3 available extended resources
+			node.Status.Capacity[testExtendedResource] = resource.MustParse("3")
+			node, err := cs.CoreV1().Nodes().UpdateStatus(context.TODO(), &node, metav1.UpdateOptions{})
+			framework.ExpectNoError(err)
+
+			// Request 2 of the available resources for the victim pods
 			podRes = v1.ResourceList{}
-			podRes[v1.ResourceCPU] = *resource.NewMilliQuantity(int64(milliCPU), resource.DecimalSI)
-			podRes[v1.ResourceMemory] = *resource.NewQuantity(int64(memory), resource.BinarySI)
+			podRes[testExtendedResource] = resource.MustParse("2")
 
 			// make the first pod low priority and the rest medium priority.
 			priorityName := mediumPriorityClassName
@@ -138,10 +134,23 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 				PriorityClassName: priorityName,
 				Resources: &v1.ResourceRequirements{
 					Requests: podRes,
+					Limits:   podRes,
+				},
+				Affinity: &v1.Affinity{
+					NodeAffinity: &v1.NodeAffinity{
+						RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
+							NodeSelectorTerms: []v1.NodeSelectorTerm{
+								{
+									MatchFields: []v1.NodeSelectorRequirement{
+										{Key: "metadata.name", Operator: v1.NodeSelectorOpIn, Values: []string{node.Name}},
+									},
+								},
+							},
+						},
+					},
 				},
-				NodeName: node.Name,
 			}))
-			framework.Logf("Created pod: %v", pods[i].Name)
+			framework.Logf("Created pod: %v with resources: %+v", pods[i].Name, pods[i].Spec.Containers[0].Resources)
 		}
 		if len(pods) < 2 {
 			framework.Failf("We need at least two pods to be created but" +
@@ -162,8 +171,8 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 			PriorityClassName: highPriorityClassName,
 			Resources: &v1.ResourceRequirements{
 				Requests: podRes,
+				Limits:   podRes,
 			},
-			NodeName: pods[0].Spec.NodeName,
 		})
 
 		preemptedPod, err := cs.CoreV1().Pods(pods[0].Namespace).Get(context.TODO(), pods[0].Name, metav1.GetOptions{})
@@ -174,7 +183,6 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 			framework.ExpectNoError(err)
 			gomega.Expect(livePod.DeletionTimestamp).To(gomega.BeNil())
 		}
-
 		framework.ExpectEqual(podPreempted, true)
 	})
 
@@ -183,30 +191,19 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 	// this critical pod.
 	ginkgo.It("validates lower priority pod preemption by critical pod", func() {
 		var podRes v1.ResourceList
+
 		// Create one pod per node that uses a lot of the node's resources.
-		ginkgo.By("Create pods that use 60% of node resources.")
+		ginkgo.By("Create pods that use 2/3 of node resources.")
 		pods := make([]*v1.Pod, 0, len(nodeList.Items))
-		allPods, err := cs.CoreV1().Pods(metav1.NamespaceAll).List(context.TODO(), metav1.ListOptions{})
-		framework.ExpectNoError(err)
 		for i, node := range nodeList.Items {
-			currentCPUUsage, currentMemUsage := getCurrentPodUsageOnTheNode(node.Name, allPods.Items, podRequestedResource)
-			framework.Logf("Current cpu usage and memory usage is %v, %v", currentCPUUsage, currentMemUsage)
-			cpuAllocatable, found := node.Status.Allocatable["cpu"]
-			framework.ExpectEqual(found, true)
-			milliCPU := cpuAllocatable.MilliValue()
-			milliCPU = int64(float64(milliCPU-currentCPUUsage) * float64(0.6))
-			memAllocatable, found := node.Status.Allocatable["memory"]
-			framework.ExpectEqual(found, true)
-			memory := memAllocatable.Value()
-			memory = int64(float64(memory-currentMemUsage) * float64(0.6))
+			// Update each node to advertise 3 available extended resources
+			node.Status.Capacity[testExtendedResource] = resource.MustParse("3")
+			node, err := cs.CoreV1().Nodes().UpdateStatus(context.TODO(), &node, metav1.UpdateOptions{})
+			framework.ExpectNoError(err)
+
+			// Request 2 of the available resources for the victim pods
 			podRes = v1.ResourceList{}
-			// If a node is already heavily utilized let not's create a pod there.
-			if milliCPU <= 0 || memory <= 0 {
-				framework.Logf("Node is heavily utilized, let's not create a pod there")
-				continue
-			}
-			podRes[v1.ResourceCPU] = *resource.NewMilliQuantity(int64(milliCPU), resource.DecimalSI)
-			podRes[v1.ResourceMemory] = *resource.NewQuantity(int64(memory), resource.BinarySI)
+			podRes[testExtendedResource] = resource.MustParse("2")
 
 			// make the first pod low priority and the rest medium priority.
 			priorityName := mediumPriorityClassName
@@ -218,8 +215,21 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 				PriorityClassName: priorityName,
 				Resources: &v1.ResourceRequirements{
 					Requests: podRes,
+					Limits:   podRes,
+				},
+				Affinity: &v1.Affinity{
+					NodeAffinity: &v1.NodeAffinity{
+						RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
+							NodeSelectorTerms: []v1.NodeSelectorTerm{
+								{
+									MatchFields: []v1.NodeSelectorRequirement{
+										{Key: "metadata.name", Operator: v1.NodeSelectorOpIn, Values: []string{node.Name}},
+									},
+								},
+							},
+						},
+					},
 				},
-				NodeName: node.Name,
 			}))
 			framework.Logf("Created pod: %v", pods[i].Name)
 		}
@@ -250,8 +260,8 @@ var _ = SIGDescribe("SchedulerPreemption [Serial]", func() {
 			PriorityClassName: scheduling.SystemClusterCritical,
 			Resources: &v1.ResourceRequirements{
 				Requests: podRes,
+				Limits:   podRes,
 			},
-			NodeName: pods[0].Spec.NodeName,
 		})
 
 		defer func() {
diff --git a/vendor/k8s.io/kubernetes/test/e2e/scheduling/priorities.go b/vendor/k8s.io/kubernetes/test/e2e/scheduling/priorities.go
index e96ff7d0a14a..a661e6fcd878 100644
--- a/vendor/k8s.io/kubernetes/test/e2e/scheduling/priorities.go
+++ b/vendor/k8s.io/kubernetes/test/e2e/scheduling/priorities.go
@@ -465,16 +465,30 @@ func createBalancedPodForNodes(f *framework.Framework, cs clientset.Interface, n
 		needCreateResource[v1.ResourceMemory] = *resource.NewQuantity(int64((ratio-memFraction)*float64(memAllocatableVal)), resource.BinarySI)
 
-		err := testutils.StartPods(cs, 1, ns, string(uuid.NewUUID()),
-			*initPausePod(f, pausePodConfig{
-				Name:   "",
-				Labels: balancePodLabel,
-				Resources: &v1.ResourceRequirements{
-					Limits:   needCreateResource,
-					Requests: needCreateResource,
+		podConfig := &pausePodConfig{
+			Name:   "",
+			Labels: balancePodLabel,
+			Resources: &v1.ResourceRequirements{
+				Limits:   needCreateResource,
+				Requests: needCreateResource,
+			},
+			Affinity: &v1.Affinity{
+				NodeAffinity: &v1.NodeAffinity{
+					RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
+						NodeSelectorTerms: []v1.NodeSelectorTerm{
+							{
+								MatchFields: []v1.NodeSelectorRequirement{
+									{Key: "metadata.name", Operator: v1.NodeSelectorOpIn, Values: []string{node.Name}},
+								},
+							},
+						},
+					},
 				},
-				NodeName: node.Name,
-			}), true, framework.Logf)
+			},
+		}
+
+		err := testutils.StartPods(cs, 1, ns, string(uuid.NewUUID()),
+			*initPausePod(f, *podConfig), true, framework.Logf)
 
 		if err != nil {
 			return err

From c584cec9b3eb92c4887de7f40f931f6591c0a683 Mon Sep 17 00:00:00 2001
From: Mike Dame
Date: Mon, 4 May 2020 16:11:57 -0400
Subject: [PATCH 2/2] Reenable preemption e2es

---
 .../util/annotate/generated/zz_generated.annotations.go | 8 ++++----
 test/extended/util/annotate/rules.go                     | 3 ---
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/test/extended/util/annotate/generated/zz_generated.annotations.go b/test/extended/util/annotate/generated/zz_generated.annotations.go
index a8ed3edc39f8..6198f9e0a223 100644
--- a/test/extended/util/annotate/generated/zz_generated.annotations.go
+++ b/test/extended/util/annotate/generated/zz_generated.annotations.go
@@ -1148,10 +1148,10 @@ var annotations = map[string]string{
 	"[Top Level] [sig-scheduling] SchedulerPredicates [Serial] validates that taints-tolerations is respected if not matching": "validates that taints-tolerations is respected if not matching [Suite:openshift/conformance/serial] [Suite:k8s]",
 	"[Top Level] [sig-scheduling] SchedulerPredicates [Serial] validates that there exists conflict between pods with same hostPort and protocol but one using 0.0.0.0 hostIP [Conformance]": "validates that there exists conflict between pods with same hostPort and protocol but one using 0.0.0.0 hostIP [Conformance] [Slow] [Suite:k8s]",
 	"[Top Level] [sig-scheduling] SchedulerPredicates [Serial] validates that there is no conflict between pods with same hostPort but different hostIP and protocol [Conformance]": "validates that there is no conflict between pods with same hostPort but different hostIP and protocol [Conformance] [Suite:openshift/conformance/serial/minimal] [Suite:k8s]",
-	"[Top Level] [sig-scheduling] SchedulerPreemption [Serial] PodTopologySpread Preemption validates proper pods are preempted": "validates proper pods are preempted [Disabled:Broken] [Suite:k8s]",
-	"[Top Level] [sig-scheduling] SchedulerPreemption [Serial] PreemptionExecutionPath runs ReplicaSets to verify preemption running path": "runs ReplicaSets to verify preemption running path [Disabled:Broken] [Suite:k8s]",
-	"[Top Level] [sig-scheduling] SchedulerPreemption [Serial] validates basic preemption works": "validates basic preemption works [Disabled:Broken] [Suite:k8s]",
-	"[Top Level] [sig-scheduling] SchedulerPreemption [Serial] validates lower priority pod preemption by critical pod": "validates lower priority pod preemption by critical pod [Disabled:Broken] [Suite:k8s]",
+	"[Top Level] [sig-scheduling] SchedulerPreemption [Serial] PodTopologySpread Preemption validates proper pods are preempted": "validates proper pods are preempted [Suite:openshift/conformance/serial] [Suite:k8s]",
+	"[Top Level] [sig-scheduling] SchedulerPreemption [Serial] PreemptionExecutionPath runs ReplicaSets to verify preemption running path": "runs ReplicaSets to verify preemption running path [Suite:openshift/conformance/serial] [Suite:k8s]",
+	"[Top Level] [sig-scheduling] SchedulerPreemption [Serial] validates basic preemption works": "validates basic preemption works [Suite:openshift/conformance/serial] [Suite:k8s]",
+	"[Top Level] [sig-scheduling] SchedulerPreemption [Serial] validates lower priority pod preemption by critical pod": "validates lower priority pod preemption by critical pod [Suite:openshift/conformance/serial] [Suite:k8s]",
 	"[Top Level] [sig-scheduling] SchedulerPriorities [Serial] Pod should avoid nodes that have avoidPod annotation": "Pod should avoid nodes that have avoidPod annotation [Suite:openshift/conformance/serial] [Suite:k8s]",
 	"[Top Level] [sig-scheduling] SchedulerPriorities [Serial] Pod should be preferably scheduled to nodes pod can tolerate": "Pod should be preferably scheduled to nodes pod can tolerate [Suite:openshift/conformance/serial] [Suite:k8s]",
 	"[Top Level] [sig-scheduling] SchedulerPriorities [Serial] Pod should be scheduled to node that don't match the PodAntiAffinity terms": "Pod should be scheduled to node that don't match the PodAntiAffinity terms [Suite:openshift/conformance/serial] [Suite:k8s]",
diff --git a/test/extended/util/annotate/rules.go b/test/extended/util/annotate/rules.go
index d50f3c1c492f..3489e508855a 100644
--- a/test/extended/util/annotate/rules.go
+++ b/test/extended/util/annotate/rules.go
@@ -92,9 +92,6 @@ var (
 
 		// A fix is in progress: https://github.com/openshift/origin/pull/24709
 		`Multi-AZ Clusters should spread the pods of a replication controller across zones`,
-
-		// Workloads: https://bugzilla.redhat.com/show_bug.cgi?id=1731263
-		`SchedulerPreemption`,
 	},
 	// tests that may work, but we don't support them
 	"[Disabled:Unsupported]": {