From 75cddc1269392bcc325efc28cfaae22380ea0484 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Mon, 16 Oct 2023 09:07:06 -0700 Subject: [PATCH] config/v1/types_cluster_version: Drop enum from ClusterVersion status capabilities The cluster-version operator should be the only actor writing to ClusterVersion status. Trust it to pick appropriate values, instead of restricting it with an enum. When updating into new capabilities, the current timeline is: 1. Outgoing CVO accepts the new target, and figures out verified and implicit capabilities. 2. Outoing CVO starts trying to write to status, but because of the new capability not being a part of the outgoing CRD's capability enum, this fails. 3. Outgoing CVO pushes the incoming ClusterVersion CRD in runlevel 0 index 1 [1]. 4. status sync attempts, if any, start working again. 5. Outgoing CVO pushes the incoming CVO Deployment in runlevel 0 index 3 [2]. 6. Deployment controller TERMs the outgoing CVO process. 7. Outgoing CVO wraps up the manifest sycning. 8. Outgoing CVO attempts a final status sync [3]. 9. Outgoing CVO releases the leader lock [4]. 10. Outgoing CVO exists. With the status enum removal in this commit, ClusterVersion status syncing will remain working the whole time, reducing the risk of the outgoing CVO shutting down before it had recorded important information in status. The enum relaxation will also help with rollbacks that drop capabilities. Those currently fail on the same status-sync issue [5]: $ oc -n openshift-cluster-version logs -l k8s-app=cluster-version-operator --tail 50 | grep 'ClusterVersion.config.openshift.io "version" is invalid:' | tail -n1 I1005 01:41:28.165931 1 cvo.go:600] Dropping "openshift-cluster-version/version" out of the queue &{0xc00039eae0 0xc0004142e8}: ClusterVersion.config.openshift.io "version" is invalid: [status.capabilities.knownCapabilities[0]: Unsupported value: "Build": supported values: "openshift-samples", "baremetal", "marketplace", "Console", "Insights", "Storage", "CSISnapshot", "NodeTuning", status.capabilities.knownCapabilities[3]: Unsupported value: "DeploymentConfig": supported values: "openshift-samples", "baremetal", "marketplace", "Console", "Insights", "Storage", "CSISnapshot", "NodeTuning", status.capabilities.knownCapabilities[4]: Unsupported value: "ImageRegistry": supported values: "openshift-samples", "baremetal", "marketplace", "Console", "Insights", "Storage", "CSISnapshot", "NodeTuning", status.capabilities.knownCapabilities[6]: Unsupported value: "MachineAPI": supported values: "openshift-samples", "baremetal", "marketplace", "Console", "Insights", "Storage", "CSISnapshot", "NodeTuning", status.capabilities.enabledCapabilities[0]: Unsupported value: "Build": supported values: "openshift-samples", "baremetal", "marketplace", "Console", "Insights", "Storage", "CSISnapshot", "NodeTuning", status.capabilities.enabledCapabilities[3]: Unsupported value: "DeploymentConfig": supported values: "openshift-samples", "baremetal", "marketplace", "Console", "Insights", "Storage", "CSISnapshot", "NodeTuning", status.capabilities.enabledCapabilities[4]: Unsupported value: "ImageRegistry": supported values: "openshift-samples", "baremetal", "marketplace", "Console", "Insights", "Storage", "CSISnapshot", "NodeTuning", status.capabilities.enabledCapabilities[6]: Unsupported value: "MachineAPI": supported values: "openshift-samples", "baremetal", "marketplace", "Console", "Insights", "Storage", "CSISnapshot", "NodeTuning"] but unlike the roll-forward case, where that is a temporary issue, on rollbacks the additional capabilities never became acceptable again, and the cluster-version operator was permanently blocked from writing to ClusterVersion status. With the status enum removal in this commit, ClusterVersion status syncing will remain working, even though it will list as an enabled capability the capability which had been added during the earlier roll-forward. [1]: https://github.com/openshift/cluster-version-operator/blob/baf7ba7b45852ad2b95e9e498585fbee35111eef/Dockerfile.rhel#L11 [2]: https://github.com/openshift/cluster-version-operator/blob/baf7ba7b45852ad2b95e9e498585fbee35111eef/install/0000_00_cluster-version-operator_03_deployment.yaml [3]: https://github.com/openshift/cluster-version-operator/blob/baf7ba7b45852ad2b95e9e498585fbee35111eef/pkg/cvo/cvo.go#L421-L423 [4]: https://github.com/openshift/cluster-version-operator/blob/baf7ba7b45852ad2b95e9e498585fbee35111eef/pkg/start/start.go#L258-L278 [5]: https://github.com/openshift/release/pull/43984#issuecomment-1747916809 --- ...ersion-operator_01_clusterversion.crd.yaml | 32 ++----------------- config/v1/types_cluster_version.go | 7 ++-- config/v1/zz_generated.deepcopy.go | 4 +-- 3 files changed, 9 insertions(+), 34 deletions(-) diff --git a/config/v1/0000_00_cluster-version-operator_01_clusterversion.crd.yaml b/config/v1/0000_00_cluster-version-operator_01_clusterversion.crd.yaml index 7cf29c2a848..76cdb774085 100644 --- a/config/v1/0000_00_cluster-version-operator_01_clusterversion.crd.yaml +++ b/config/v1/0000_00_cluster-version-operator_01_clusterversion.crd.yaml @@ -187,43 +187,15 @@ spec: description: enabledCapabilities lists all the capabilities that are currently managed. type: array items: - description: ClusterVersionCapability enumerates optional, core cluster components. + description: ClusterVersionStatusCapability enumerates optional, core cluster components, but without ClusterVersionCapability's enum. type: string - enum: - - openshift-samples - - baremetal - - marketplace - - Console - - Insights - - Storage - - CSISnapshot - - NodeTuning - - MachineAPI - - Build - - DeploymentConfig - - ImageRegistry - - OperatorLifecycleManager x-kubernetes-list-type: atomic knownCapabilities: description: knownCapabilities lists all the capabilities known to the current cluster. type: array items: - description: ClusterVersionCapability enumerates optional, core cluster components. + description: ClusterVersionStatusCapability enumerates optional, core cluster components, but without ClusterVersionCapability's enum. type: string - enum: - - openshift-samples - - baremetal - - marketplace - - Console - - Insights - - Storage - - CSISnapshot - - NodeTuning - - MachineAPI - - Build - - DeploymentConfig - - ImageRegistry - - OperatorLifecycleManager x-kubernetes-list-type: atomic conditionalUpdates: description: conditionalUpdates contains the list of updates that may be recommended for this cluster if it meets specific required conditions. Consumers interested in the set of updates that are actually recommended for this cluster should use availableUpdates. This list may be empty if no updates are recommended, if the update service is unavailable, or if an empty or invalid channel has been specified. diff --git a/config/v1/types_cluster_version.go b/config/v1/types_cluster_version.go index a9bade6fe79..1acbed69998 100644 --- a/config/v1/types_cluster_version.go +++ b/config/v1/types_cluster_version.go @@ -252,6 +252,9 @@ const ( // +kubebuilder:validation:Enum=openshift-samples;baremetal;marketplace;Console;Insights;Storage;CSISnapshot;NodeTuning;MachineAPI;Build;DeploymentConfig;ImageRegistry;OperatorLifecycleManager type ClusterVersionCapability string +// ClusterVersionStatusCapability enumerates optional, core cluster components, but without ClusterVersionCapability's enum. +type ClusterVersionStatusCapability string + const ( // ClusterVersionCapabilityOpenShiftSamples manages the sample // image streams and templates stored in the openshift @@ -489,12 +492,12 @@ type ClusterVersionCapabilitiesStatus struct { // enabledCapabilities lists all the capabilities that are currently managed. // +listType=atomic // +optional - EnabledCapabilities []ClusterVersionCapability `json:"enabledCapabilities,omitempty"` + EnabledCapabilities []ClusterVersionStatusCapability `json:"enabledCapabilities,omitempty"` // knownCapabilities lists all the capabilities known to the current cluster. // +listType=atomic // +optional - KnownCapabilities []ClusterVersionCapability `json:"knownCapabilities,omitempty"` + KnownCapabilities []ClusterVersionStatusCapability `json:"knownCapabilities,omitempty"` } // ComponentOverride allows overriding cluster version operator's behavior diff --git a/config/v1/zz_generated.deepcopy.go b/config/v1/zz_generated.deepcopy.go index 63b9f050d00..e803fdb50ba 100644 --- a/config/v1/zz_generated.deepcopy.go +++ b/config/v1/zz_generated.deepcopy.go @@ -1110,12 +1110,12 @@ func (in *ClusterVersionCapabilitiesStatus) DeepCopyInto(out *ClusterVersionCapa *out = *in if in.EnabledCapabilities != nil { in, out := &in.EnabledCapabilities, &out.EnabledCapabilities - *out = make([]ClusterVersionCapability, len(*in)) + *out = make([]ClusterVersionStatusCapability, len(*in)) copy(*out, *in) } if in.KnownCapabilities != nil { in, out := &in.KnownCapabilities, &out.KnownCapabilities - *out = make([]ClusterVersionCapability, len(*in)) + *out = make([]ClusterVersionStatusCapability, len(*in)) copy(*out, *in) } return