Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pkg/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,10 @@ const (
// in the form "[MAJOR].[MINOR].[PATCH]".
VersionMajorMinorPatchLabel = "hive.openshift.io/version-major-minor-patch"

	// MinorVersionUpgradeUnavailable is a label applied to ClusterDeployments to surface concerns about upgrades to
	// the next minor version, i.e. 4.y -> 4.(y+1).
MinorVersionUpgradeUnavailable = "hive.openshift.io/minor-version-upgrade-unavailable"

// OvirtCredentialsDir is the directory containing Ovirt credentials files.
OvirtCredentialsDir = "/.ovirt"

Expand Down
17 changes: 17 additions & 0 deletions pkg/controller/clusterversion/clusterversion_controller.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package clusterversion
Copy link
Copy Markdown
Member

@wking wking Mar 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't want the main comment-stream to get too noisy with the latency issue, so pulling this:

I also want to point out that this controller is by default only Watch()ing ClusterDeployment (on the hub), which means there may be a nontrivial delay between when the spoke ClusterVersion object changes and when the controller runs to update the hub CD labels.

out to this random, unrelated line of code to give it a dedicated thread.

I'm personally not concerned about the latency here, because most of the issues that Upgradeable complains about are long-running, slow-changing issues (like "you're on SDN; migrate to OVN to access 4.17"). So ~hours stale is likely to be still accurate in many cases.

And when we miss with a false negative (stale ClusterDeployment data said the update was ok, but turned out ClusterVersion had moved to be Upgradeable=False), it's not terrible. The user could request an update, and the cluster-version operator would reject the request with whatever the Upgradeable=False message was. So the cluster is still safe, it's just a bit more of a rug-pull UX. Having fresher data would improve the UX, but would not increase cluster safety.

When we miss with a false positive (stale ClusterDeployment data said the update was blocked, but turned out ClusterVersion had moved to be Upgradeable=True or unset the Upgradeable condition), it's not terrible either. The user's access to the next 4.(y+1) is delayed by an hour or two until the ClusterDeployment catches up. But it's just a feature update, patch updates pulling in bugfixes and CVEs would not be impacted. And users who want to avoid any risk of delay could just get their Upgradeable ducks lined up more than an hour before they were hoping to launch the update (hopefully nobody is actually trying to cut it that close).

The gap here is flappy issues like PoolUpdating, which I dropped in 4.19 (openshift/machine-config-operator#4760), and I'm happy to backport that (and fixes to any other flappy Upgradeable conditions, although I can't think of any offhand) to older 4.y, if stale ClusterDeployment UX impacts turn out to be an issue.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool. Note that a dummy CD update (like an annotation) could be used to force a resync. That's a thing you can't do through OCM today (that I know of), but would be trivial to implement.


import (
"cmp"
"context"
"fmt"
"math/rand"
Expand Down Expand Up @@ -211,6 +212,22 @@ func (r *ReconcileClusterVersion) updateClusterVersionLabels(cd *hivev1.ClusterD
changed = changed || origLen != len(cd.Labels)
}

upgradeableCondition := ""
for _, condition := range clusterVersion.Status.Conditions {
if condition.Type == openshiftapiv1.OperatorUpgradeable {
if condition.Status != openshiftapiv1.ConditionTrue {
upgradeableCondition = cmp.Or(condition.Message, fmt.Sprintf("%s: %s", condition.Type, condition.Status))
}
break
}
}
changed = changed || upgradeableCondition != cd.Labels[constants.MinorVersionUpgradeUnavailable]
if upgradeableCondition == "" {
delete(cd.Labels, constants.MinorVersionUpgradeUnavailable)
} else {
cd.Labels[constants.MinorVersionUpgradeUnavailable] = upgradeableCondition
}

if !changed {
cdLog.Debug("labels have not changed, nothing to update")
return nil
Expand Down
114 changes: 104 additions & 10 deletions pkg/controller/clusterversion/clusterversion_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,12 @@ func init() {
func TestClusterVersionReconcile(t *testing.T) {

tests := []struct {
name string
existing []runtime.Object
noRemoteCall bool
expectError bool
validate func(*testing.T, *hivev1.ClusterDeployment)
name string
existing []runtime.Object
clusterVersionStatus configv1.ClusterVersionStatus
noRemoteCall bool
expectError bool
validate func(*testing.T, *hivev1.ClusterDeployment)
}{
{
// no cluster deployment, no error expected
Expand All @@ -69,13 +70,105 @@ func TestClusterVersionReconcile(t *testing.T) {
testClusterDeployment(),
testKubeconfigSecret(),
},
clusterVersionStatus: testRemoteClusterVersionStatus(),
validate: func(t *testing.T, cd *hivev1.ClusterDeployment) {
assert.Equal(t, "2.3.4+somebuild", cd.Labels[constants.VersionLabel], "unexpected version label")
assert.Equal(t, "2", cd.Labels[constants.VersionMajorLabel], "unexpected version major label")
assert.Equal(t, "2.3", cd.Labels[constants.VersionMajorMinorLabel], "unexpected version major-minor label")
assert.Equal(t, "2.3.4", cd.Labels[constants.VersionMajorMinorPatchLabel], "unexpected version major-minor-patch label")
},
},
{
name: "upgradeable condition true",
existing: []runtime.Object{
testClusterDeployment(),
testKubeconfigSecret(),
},
clusterVersionStatus: testRemoteClusterVersionStatus(configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorUpgradeable,
Status: configv1.ConditionTrue,
}),
validate: func(t *testing.T, cd *hivev1.ClusterDeployment) {
assert.Equal(t, "", cd.Labels[constants.MinorVersionUpgradeUnavailable], "unexpected version major-minor-patch label")
},
},
{
name: "upgradeable condition false",
existing: []runtime.Object{
testClusterDeployment(),
testKubeconfigSecret(),
},
clusterVersionStatus: testRemoteClusterVersionStatus(configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorUpgradeable,
Status: configv1.ConditionFalse,
}),
validate: func(t *testing.T, cd *hivev1.ClusterDeployment) {
assert.Equal(t, "Upgradeable: False", cd.Labels[constants.MinorVersionUpgradeUnavailable], "unexpected version major-minor-patch label")
},
},
{
name: "upgradeable condition false with message",
existing: []runtime.Object{
testClusterDeployment(),
testKubeconfigSecret(),
},
clusterVersionStatus: testRemoteClusterVersionStatus(configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorUpgradeable,
Status: configv1.ConditionFalse,
Message: "Can't do the upgrade",
}),
validate: func(t *testing.T, cd *hivev1.ClusterDeployment) {
assert.Equal(t, "Can't do the upgrade", cd.Labels[constants.MinorVersionUpgradeUnavailable], "unexpected version major-minor-patch label")
},
},
{
name: "upgradeable condition unknown",
existing: []runtime.Object{
testClusterDeployment(),
testKubeconfigSecret(),
},
clusterVersionStatus: testRemoteClusterVersionStatus(configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorUpgradeable,
Status: configv1.ConditionUnknown,
}),
validate: func(t *testing.T, cd *hivev1.ClusterDeployment) {
assert.Equal(t, "Upgradeable: Unknown", cd.Labels[constants.MinorVersionUpgradeUnavailable], "unexpected version major-minor-patch label")
},
},
{
name: "upgradeable condition unknown with message",
existing: []runtime.Object{
testClusterDeployment(),
testKubeconfigSecret(),
},
clusterVersionStatus: testRemoteClusterVersionStatus(configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorUpgradeable,
Status: configv1.ConditionUnknown,
Message: "Can't read status",
}),
validate: func(t *testing.T, cd *hivev1.ClusterDeployment) {
assert.Equal(t, "Can't read status", cd.Labels[constants.MinorVersionUpgradeUnavailable], "unexpected version major-minor-patch label")
},
},
{
name: "upgradeable condition when multiple conditions",
existing: []runtime.Object{
testClusterDeployment(),
testKubeconfigSecret(),
},
clusterVersionStatus: testRemoteClusterVersionStatus(
configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorProgressing,
},
configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorUpgradeable,
Status: configv1.ConditionFalse,
Message: "It can't upgrade",
}),
validate: func(t *testing.T, cd *hivev1.ClusterDeployment) {
assert.Equal(t, "It can't upgrade", cd.Labels[constants.MinorVersionUpgradeUnavailable], "unexpected version major-minor-patch label")
},
},
}

for _, test := range tests {
Expand All @@ -85,7 +178,7 @@ func TestClusterVersionReconcile(t *testing.T) {
mockCtrl := gomock.NewController(t)
mockRemoteClientBuilder := remoteclientmock.NewMockBuilder(mockCtrl)
if !test.noRemoteCall {
mockRemoteClientBuilder.EXPECT().Build().Return(testRemoteClusterAPIClient(), nil)
mockRemoteClientBuilder.EXPECT().Build().Return(testRemoteClusterAPIClient(test.clusterVersionStatus), nil)
}
rcd := &ReconcileClusterVersion{
Client: fakeClient,
Expand Down Expand Up @@ -183,19 +276,19 @@ func testSecret(name, key, value string) *corev1.Secret {
return s
}

func testRemoteClusterAPIClient() client.Client {
func testRemoteClusterAPIClient(status configv1.ClusterVersionStatus) client.Client {
remoteClusterVersion := &configv1.ClusterVersion{
ObjectMeta: metav1.ObjectMeta{
Name: remoteClusterVersionObjectName,
},
}
remoteClusterVersion.Status = *testRemoteClusterVersionStatus()
remoteClusterVersion.Status = status
return testfake.NewFakeClientBuilder().WithRuntimeObjects(remoteClusterVersion).Build()
}

func testRemoteClusterVersionStatus() *configv1.ClusterVersionStatus {
func testRemoteClusterVersionStatus(conditions ...configv1.ClusterOperatorStatusCondition) configv1.ClusterVersionStatus {
zeroTime := metav1.NewTime(time.Unix(0, 0))
return &configv1.ClusterVersionStatus{
return configv1.ClusterVersionStatus{
Desired: configv1.Release{
Version: "2.3.4+somebuild",
},
Expand All @@ -209,5 +302,6 @@ func testRemoteClusterVersionStatus() *configv1.ClusterVersionStatus {
},
ObservedGeneration: 123456789,
VersionHash: "TESTVERSIONHASH",
Conditions: conditions,
}
}