From b05b30ee316eeb2131a6e367c520237018756e8b Mon Sep 17 00:00:00 2001
From: Haseeb Tariq
Date: Wed, 30 Nov 2022 22:47:39 -0800
Subject: [PATCH] Update etcd scaling test for CPMS supported platforms

For platforms where the ControlPlaneMachineSet is active and being
reconciled by the CPMSO, the vertical scaling test should rely on the
CPMSO to remove and add new machines; otherwise there is a race between
the test removing a machine and the CPMSO adding a new one.
---
 test/extended/etcd/helpers/helpers.go  | 88 ++++++++++++++++++++++++++
 test/extended/etcd/vertical_scaling.go | 61 ++++++++++++++++++
 2 files changed, 149 insertions(+)

diff --git a/test/extended/etcd/helpers/helpers.go b/test/extended/etcd/helpers/helpers.go
index 269ccf2dff4b..8bba5a0d1597 100644
--- a/test/extended/etcd/helpers/helpers.go
+++ b/test/extended/etcd/helpers/helpers.go
@@ -3,6 +3,7 @@ package helpers
 import (
     "context"
     "fmt"
+    "sort"
     "strings"
     "time"

@@ -10,8 +11,10 @@ import (
     o "github.com/onsi/gomega"

     configv1 "github.com/openshift/api/config/v1"
+    machinev1 "github.com/openshift/api/machine/v1"
     machinev1beta1 "github.com/openshift/api/machine/v1beta1"
     machineclient "github.com/openshift/client-go/machine/clientset/versioned"
+    machinev1client "github.com/openshift/client-go/machine/clientset/versioned/typed/machine/v1"
     machinev1beta1client "github.com/openshift/client-go/machine/clientset/versioned/typed/machine/v1beta1"

     bmhelper "github.com/openshift/origin/test/extended/baremetal"
@@ -180,6 +183,91 @@ func recoverClusterToInitialStateIfNeeded(ctx context.Context, t TestingT, machi
     })
 }

+func DeleteSingleMachine(ctx context.Context, t TestingT, machineClient machinev1beta1client.MachineInterface) (string, error) {
+    waitPollInterval := 15 * time.Second
+    waitPollTimeout := 5 * time.Minute
+    t.Logf("Waiting up to %s to delete a machine", waitPollTimeout.String())
+
+    machineToDelete := ""
+    err := wait.Poll(waitPollInterval, waitPollTimeout, func() (bool, error) {
+        machineList, err := machineClient.List(ctx, metav1.ListOptions{LabelSelector: masterMachineLabelSelector})
+        if err != nil {
+            return isTransientAPIError(t, err)
+        }
+
+        // Machine names are suffixed with an index number (e.g. "ci-op-xlbdrkvl-6a467-qcbkh-master-0")
+        // so we sort to pick the lowest index, e.g. master-0 in this example
+        machineNames := []string{}
+        for _, m := range machineList.Items {
+            machineNames = append(machineNames, m.Name)
+        }
+        sort.Strings(machineNames)
+        machineToDelete = machineNames[0]
+        t.Logf("attempting to delete machine %q", machineToDelete)
+
+        if err := machineClient.Delete(ctx, machineToDelete, metav1.DeleteOptions{}); err != nil {
+            // The machine we just listed should be present, but if not, error out
+            if apierrors.IsNotFound(err) {
+                t.Logf("machine %q was listed but not found or already deleted", machineToDelete)
+                return false, fmt.Errorf("machine %q was listed but not found or already deleted", machineToDelete)
+            }
+            return isTransientAPIError(t, err)
+        }
+        t.Logf("successfully deleted machine %q", machineToDelete)
+
+        return true, nil
+    })
+
+    return machineToDelete, err
+}
+
+// IsCPMSActive returns true if the current platform has an active CPMS
+// Not all platforms are supported (as of 4.12 only AWS and Azure)
+// See https://github.com/openshift/cluster-control-plane-machine-set-operator/tree/main/docs/user#supported-platforms
+func IsCPMSActive(ctx context.Context, t TestingT, cpmsClient machinev1client.ControlPlaneMachineSetInterface) (bool, error) {
+    // The CPMS singleton in the "openshift-machine-api" namespace is named "cluster"
+    // https://github.com/openshift/cluster-control-plane-machine-set-operator/blob/bba395abab62fc12de4a9b9b030700546f4b822e/pkg/controllers/controlplanemachineset/controller.go#L50-L53
+    cpms, err := cpmsClient.Get(ctx, "cluster", metav1.GetOptions{})
+    if err != nil {
+        if apierrors.IsNotFound(err) {
+            return false, nil
+        }
+        return false, err
+    }
+
+    // The CPMS state must be active in order for the platform to be supported
+    // See https://github.com/openshift/cluster-control-plane-machine-set-operator/blob/7961d1457c6aef26d3b1dafae962da2a2aba18ef/docs/user/installation.md#anatomy-of-a-controlplanemachineset
+    if cpms.Spec.State != machinev1.ControlPlaneMachineSetStateActive {
+        return false, nil
+    }
+
+    return true, nil
+}
+
+// EnsureReadyReplicasOnCPMS checks that status.readyReplicas on the cluster CPMS equals the expected count;
+// this effectively counts the number of control-plane machines whose provider state is Running
+func EnsureReadyReplicasOnCPMS(ctx context.Context, t TestingT, expectedReplicaCount int, cpmsClient machinev1client.ControlPlaneMachineSetInterface) error {
+    waitPollInterval := 5 * time.Second
+    waitPollTimeout := 10 * time.Minute
+    t.Logf("Waiting up to %s for the CPMS to have status.readyReplicas = %v", waitPollTimeout.String(), expectedReplicaCount)
+
+    return wait.Poll(waitPollInterval, waitPollTimeout, func() (bool, error) {
+        cpms, err := cpmsClient.Get(ctx, "cluster", metav1.GetOptions{})
+        if err != nil {
+            return isTransientAPIError(t, err)
+        }
+
+        if cpms.Status.ReadyReplicas != int32(expectedReplicaCount) {
+            t.Logf("expected %d ready replicas on CPMS, got: %v", expectedReplicaCount, cpms.Status.ReadyReplicas)
+            return false, nil
+        }
+
+        t.Logf("CPMS has reached the desired number of ready replicas: %v", cpms.Status.ReadyReplicas)
+
+        return true, nil
+    })
+}
+
 // EnsureVotingMembersCount counts the number of voting etcd members, it doesn't evaluate health conditions or any other attributes (i.e. name) of individual members
 // this method won't fail immediately on errors, this is useful during scaling down operation until the feature can ensure this operation to be graceful
 func EnsureVotingMembersCount(ctx context.Context, t TestingT, etcdClientFactory EtcdClientCreator, kubeClient kubernetes.Interface, expectedMembersCount int) error {
diff --git a/test/extended/etcd/vertical_scaling.go b/test/extended/etcd/vertical_scaling.go
index 0666597066eb..da83d2c5dc42 100644
--- a/test/extended/etcd/vertical_scaling.go
+++ b/test/extended/etcd/vertical_scaling.go
@@ -44,6 +44,7 @@ var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd
         machineClientSet, err := machineclient.NewForConfig(oc.KubeFramework().ClientConfig())
         o.Expect(err).ToNot(o.HaveOccurred())
         machineClient := machineClientSet.MachineV1beta1().Machines("openshift-machine-api")
+        cpmsClient := machineClientSet.MachineV1().ControlPlaneMachineSets("openshift-machine-api")
         kubeClient := oc.KubeClient()

         // make sure it can be run on the current platform
@@ -54,6 +55,66 @@ var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd
         err = errors.Wrap(err, "pre-test: timed out waiting for initial cluster state to have 3 running machines and 3 voting members")
         o.Expect(err).ToNot(o.HaveOccurred())

+        cpmsActive, err := scalingtestinglibrary.IsCPMSActive(ctx, g.GinkgoT(), cpmsClient)
+        err = errors.Wrap(err, "pre-test: failed to determine if ControlPlaneMachineSet is present and active")
+        o.Expect(err).ToNot(o.HaveOccurred())
+
+        if cpmsActive {
+            // TODO: Add cleanup step to recover back to 3 running machines and members if the test fails
+
+            framework.Logf("CPMS is active. Relying on the CPMSO to replace the machine during vertical scaling")
+
+            // step 1: delete a running machine to trigger the CPMSO to create a new one to replace it
+            machineName, err := scalingtestinglibrary.DeleteSingleMachine(ctx, g.GinkgoT(), machineClient)
+            o.Expect(err).ToNot(o.HaveOccurred())
+            framework.Logf("Waiting for machine %q pending deletion to be replaced", machineName)
+
+            memberName, err := scalingtestinglibrary.MachineNameToEtcdMemberName(ctx, oc.KubeClient(), machineClient, machineName)
+            err = errors.Wrapf(err, "failed to get etcd member name for deleted machine: %v", machineName)
+            o.Expect(err).ToNot(o.HaveOccurred())
+
+            // step 2: wait until the CPMSO scales up by creating a new machine
+            // We need to check the CPMS' status.readyReplicas because the phase of one machine will always be Deleting,
+            // so we can't use EnsureMasterMachinesAndCount() since that counts machines that aren't pending deletion
+            err = scalingtestinglibrary.EnsureReadyReplicasOnCPMS(ctx, g.GinkgoT(), 4, cpmsClient)
+            err = errors.Wrap(err, "scale-up: timed out waiting for CPMS to show 4 ready replicas")
+            o.Expect(err).ToNot(o.HaveOccurred())
+
+            // We can't check for 4 members here as the clustermemberremoval controller will race to
+            // remove the old member (from the machine pending deletion) as soon as the new machine's member
+            // is promoted to a voting member.
+            // Instead we just wait until the CPMS shows 3 replicas again which indicates that the new member was added
+            // successfully
+
+            // step 3: wait for automatic scale-down as the replica count goes back down to 3
+            err = scalingtestinglibrary.EnsureReadyReplicasOnCPMS(ctx, g.GinkgoT(), 3, cpmsClient)
+            err = errors.Wrap(err, "scale-down: timed out waiting for CPMS to show 3 ready replicas")
+            o.Expect(err).ToNot(o.HaveOccurred())
+
+            err = scalingtestinglibrary.EnsureVotingMembersCount(ctx, g.GinkgoT(), etcdClientFactory, kubeClient, 3)
+            err = errors.Wrap(err, "scale-down: timed out waiting for 3 voting members in the etcd cluster and etcd-endpoints configmap")
+            o.Expect(err).ToNot(o.HaveOccurred())
+
+            err = scalingtestinglibrary.EnsureMemberRemoved(g.GinkgoT(), etcdClientFactory, memberName)
+            err = errors.Wrapf(err, "scale-down: timed out waiting for member (%v) to be removed", memberName)
+            o.Expect(err).ToNot(o.HaveOccurred())
+
+            err = scalingtestinglibrary.EnsureMasterMachinesAndCount(ctx, g.GinkgoT(), machineClient)
+            err = errors.Wrap(err, "scale-down: timed out waiting for only 3 Running master machines")
+            o.Expect(err).ToNot(o.HaveOccurred())
+
+            // step 4: wait for apiserver revision rollout to stabilize
+            g.GinkgoT().Log("waiting for api servers to stabilize on the same revision")
+            err = testlibraryapi.WaitForAPIServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc.KubeClient().CoreV1().Pods("openshift-kube-apiserver"))
+            err = errors.Wrap(err, "scale-up: timed out waiting for APIServer pods to stabilize on the same revision")
+            o.Expect(err).ToNot(o.HaveOccurred())
+
+            return
+        }
+
+        // For a non-CPMS supported platform the test resorts to manually creating and deleting a machine
+        framework.Logf("CPMS is inactive. The test will manually add and remove a machine for vertical scaling")
+
         // step 0: ensure clean state after the test
         defer func() {
             // since the deletion triggers a new rollout