diff --git a/test/extended/etcd/helpers/helpers.go b/test/extended/etcd/helpers/helpers.go
index 269ccf2dff4b..8bba5a0d1597 100644
--- a/test/extended/etcd/helpers/helpers.go
+++ b/test/extended/etcd/helpers/helpers.go
@@ -3,6 +3,7 @@ package helpers
 import (
 	"context"
 	"fmt"
+	"sort"
 	"strings"
 	"time"
 
@@ -10,8 +11,10 @@ import (
 	o "github.com/onsi/gomega"
 
 	configv1 "github.com/openshift/api/config/v1"
+	machinev1 "github.com/openshift/api/machine/v1"
 	machinev1beta1 "github.com/openshift/api/machine/v1beta1"
 	machineclient "github.com/openshift/client-go/machine/clientset/versioned"
+	machinev1client "github.com/openshift/client-go/machine/clientset/versioned/typed/machine/v1"
 	machinev1beta1client "github.com/openshift/client-go/machine/clientset/versioned/typed/machine/v1beta1"
 	bmhelper "github.com/openshift/origin/test/extended/baremetal"
 
@@ -180,6 +183,91 @@ func recoverClusterToInitialStateIfNeeded(ctx context.Context, t TestingT, machi
 	})
 }
 
+func DeleteSingleMachine(ctx context.Context, t TestingT, machineClient machinev1beta1client.MachineInterface) (string, error) {
+	waitPollInterval := 15 * time.Second
+	waitPollTimeout := 5 * time.Minute
+	t.Logf("Waiting up to %s to delete a machine", waitPollTimeout.String())
+
+	machineToDelete := ""
+	err := wait.Poll(waitPollInterval, waitPollTimeout, func() (bool, error) {
+		machineList, err := machineClient.List(ctx, metav1.ListOptions{LabelSelector: masterMachineLabelSelector})
+		if err != nil {
+			return isTransientAPIError(t, err)
+		}
+
+		// Machine names are suffixed with an index number (e.g. "ci-op-xlbdrkvl-6a467-qcbkh-master-0"),
+		// so we sort to pick the lowest index, e.g. master-0 in this example
+		machineNames := []string{}
+		for _, m := range machineList.Items {
+			machineNames = append(machineNames, m.Name)
+		}
+		sort.Strings(machineNames)
+		machineToDelete = machineNames[0]
+		t.Logf("attempting to delete machine %q", machineToDelete)
+
+		if err := machineClient.Delete(ctx, machineToDelete, metav1.DeleteOptions{}); err != nil {
+			// The machine we just listed should be present; if not, error out
+			if apierrors.IsNotFound(err) {
+				t.Logf("machine %q was listed but not found or already deleted", machineToDelete)
+				return false, fmt.Errorf("machine %q was listed but not found or already deleted", machineToDelete)
+			}
+			return isTransientAPIError(t, err)
+		}
+		t.Logf("successfully deleted machine %q", machineToDelete)
+
+		return true, nil
+	})
+
+	return machineToDelete, err
+}
+
+// IsCPMSActive returns true if the current platform has an active CPMS.
+// Not all platforms are supported (as of 4.12 only AWS and Azure).
+// See https://github.com/openshift/cluster-control-plane-machine-set-operator/tree/main/docs/user#supported-platforms
+func IsCPMSActive(ctx context.Context, t TestingT, cpmsClient machinev1client.ControlPlaneMachineSetInterface) (bool, error) {
+	// The CPMS singleton in the "openshift-machine-api" namespace is named "cluster"
+	// https://github.com/openshift/cluster-control-plane-machine-set-operator/blob/bba395abab62fc12de4a9b9b030700546f4b822e/pkg/controllers/controlplanemachineset/controller.go#L50-L53
+	cpms, err := cpmsClient.Get(ctx, "cluster", metav1.GetOptions{})
+	if err != nil {
+		if apierrors.IsNotFound(err) {
+			return false, nil
+		}
+		return false, err
+	}
+
+	// The CPMS state must be Active in order for the platform to be supported
+	// See https://github.com/openshift/cluster-control-plane-machine-set-operator/blob/7961d1457c6aef26d3b1dafae962da2a2aba18ef/docs/user/installation.md#anatomy-of-a-controlplanemachineset
+	if cpms.Spec.State != machinev1.ControlPlaneMachineSetStateActive {
+		return false, nil
+	}
+
+	return true, nil
+}
+
+// EnsureReadyReplicasOnCPMS checks that status.readyReplicas on the cluster CPMS equals expectedReplicaCount.
+// This effectively counts the number of control-plane machines whose provider state is Running.
+func EnsureReadyReplicasOnCPMS(ctx context.Context, t TestingT, expectedReplicaCount int, cpmsClient machinev1client.ControlPlaneMachineSetInterface) error {
+	waitPollInterval := 5 * time.Second
+	waitPollTimeout := 10 * time.Minute
+	t.Logf("Waiting up to %s for the CPMS to have status.readyReplicas = %v", waitPollTimeout.String(), expectedReplicaCount)
+
+	return wait.Poll(waitPollInterval, waitPollTimeout, func() (bool, error) {
+		cpms, err := cpmsClient.Get(ctx, "cluster", metav1.GetOptions{})
+		if err != nil {
+			return isTransientAPIError(t, err)
+		}
+
+		if cpms.Status.ReadyReplicas != int32(expectedReplicaCount) {
+			t.Logf("expected %d ready replicas on CPMS, got: %v", expectedReplicaCount, cpms.Status.ReadyReplicas)
+			return false, nil
+		}
+
+		t.Logf("CPMS has reached the desired number of ready replicas: %v", cpms.Status.ReadyReplicas)
+
+		return true, nil
+	})
+}
+
 // EnsureVotingMembersCount counts the number of voting etcd members, it doesn't evaluate health conditions or any other attributes (i.e. name) of individual members
 // this method won't fail immediately on errors, this is useful during scaling down operation until the feature can ensure this operation to be graceful
 func EnsureVotingMembersCount(ctx context.Context, t TestingT, etcdClientFactory EtcdClientCreator, kubeClient kubernetes.Interface, expectedMembersCount int) error {
diff --git a/test/extended/etcd/vertical_scaling.go b/test/extended/etcd/vertical_scaling.go
index 0666597066eb..da83d2c5dc42 100644
--- a/test/extended/etcd/vertical_scaling.go
+++ b/test/extended/etcd/vertical_scaling.go
@@ -44,6 +44,7 @@ var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd
 		machineClientSet, err := machineclient.NewForConfig(oc.KubeFramework().ClientConfig())
 		o.Expect(err).ToNot(o.HaveOccurred())
 		machineClient := machineClientSet.MachineV1beta1().Machines("openshift-machine-api")
+		cpmsClient := machineClientSet.MachineV1().ControlPlaneMachineSets("openshift-machine-api")
 		kubeClient := oc.KubeClient()
 
 		// make sure it can be run on the current platform
@@ -54,6 +55,66 @@ var _ = g.Describe("[sig-etcd][Feature:EtcdVerticalScaling][Suite:openshift/etcd
 		err = errors.Wrap(err, "pre-test: timed out waiting for initial cluster state to have 3 running machines and 3 voting members")
 		o.Expect(err).ToNot(o.HaveOccurred())
 
+		cpmsActive, err := scalingtestinglibrary.IsCPMSActive(ctx, g.GinkgoT(), cpmsClient)
+		err = errors.Wrap(err, "pre-test: failed to determine if ControlPlaneMachineSet is present and active")
+		o.Expect(err).ToNot(o.HaveOccurred())
+
+		if cpmsActive {
+			// TODO: Add cleanup step to recover back to 3 running machines and members if the test fails
+
+			framework.Logf("CPMS is active. Relying on CPMSO to replace the machine during vertical scaling")
+
+			// step 1: delete a running machine to trigger the CPMSO to create a new one to replace it
+			machineName, err := scalingtestinglibrary.DeleteSingleMachine(ctx, g.GinkgoT(), machineClient)
+			o.Expect(err).ToNot(o.HaveOccurred())
+			framework.Logf("Waiting for machine %q pending deletion to be replaced", machineName)
+
+			memberName, err := scalingtestinglibrary.MachineNameToEtcdMemberName(ctx, oc.KubeClient(), machineClient, machineName)
+			err = errors.Wrapf(err, "failed to get etcd member name for deleted machine: %v", machineName)
+			o.Expect(err).ToNot(o.HaveOccurred())
+
+			// step 2: wait until the CPMSO scales up by creating a new machine
+			// We need to check the CPMS's status.readyReplicas because the phase of one machine will always be Deleting,
+			// so we can't use EnsureMasterMachinesAndCount() since that only counts machines that aren't pending deletion
+			err = scalingtestinglibrary.EnsureReadyReplicasOnCPMS(ctx, g.GinkgoT(), 4, cpmsClient)
+			err = errors.Wrap(err, "scale-up: timed out waiting for CPMS to show 4 ready replicas")
+			o.Expect(err).ToNot(o.HaveOccurred())
+
+			// We can't check for 4 members here as the clustermemberremoval controller will race to
+			// remove the old member (from the machine pending deletion) as soon as the new machine's member
+			// is promoted to a voting member.
+			// Instead we just wait until the CPMS shows 3 ready replicas again, which indicates that the new
+			// member was added successfully
+
+			// step 3: wait for automatic scale-down as the replica count goes back down to 3
+			err = scalingtestinglibrary.EnsureReadyReplicasOnCPMS(ctx, g.GinkgoT(), 3, cpmsClient)
+			err = errors.Wrap(err, "scale-down: timed out waiting for CPMS to show 3 ready replicas")
+			o.Expect(err).ToNot(o.HaveOccurred())
+
+			err = scalingtestinglibrary.EnsureVotingMembersCount(ctx, g.GinkgoT(), etcdClientFactory, kubeClient, 3)
+			err = errors.Wrap(err, "scale-down: timed out waiting for 3 voting members in the etcd cluster and etcd-endpoints configmap")
+			o.Expect(err).ToNot(o.HaveOccurred())
+
+			err = scalingtestinglibrary.EnsureMemberRemoved(g.GinkgoT(), etcdClientFactory, memberName)
+			err = errors.Wrapf(err, "scale-down: timed out waiting for member (%v) to be removed", memberName)
+			o.Expect(err).ToNot(o.HaveOccurred())
+
+			err = scalingtestinglibrary.EnsureMasterMachinesAndCount(ctx, g.GinkgoT(), machineClient)
+			err = errors.Wrap(err, "scale-down: timed out waiting for only 3 Running master machines")
+			o.Expect(err).ToNot(o.HaveOccurred())
+
+			// step 4: wait for the API server revision rollout to stabilize
+			g.GinkgoT().Log("waiting for api servers to stabilize on the same revision")
+			err = testlibraryapi.WaitForAPIServerToStabilizeOnTheSameRevision(g.GinkgoT(), oc.KubeClient().CoreV1().Pods("openshift-kube-apiserver"))
+			err = errors.Wrap(err, "scale-up: timed out waiting for APIServer pods to stabilize on the same revision")
+			o.Expect(err).ToNot(o.HaveOccurred())
+
+			return
+		}
+
+		// For a platform without an active CPMS, the test resorts to manually creating and deleting a machine
+		framework.Logf("CPMS is inactive. The test will manually add and remove a machine for vertical scaling")
+
 		// step 0: ensure clean state after the test
 		defer func() {
 			// since the deletion triggers a new rollout