From 79742d717c8e4dce3b5bf37ed5131b3f931f4359 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Fri, 4 Feb 2022 23:21:25 -0800 Subject: [PATCH 1/2] pkg/featurechangestopper: Seed queue to guard against incorrect startingTechPreviewState As described in [1], some 4.10 jobs that set TechPreviewNoUpgrade very early during install are running into trouble like: 1. Early in bootstrap, something sets TechPreviewNoUpgrade. 2. Cluster-version operator comes up, and attempts to figure out the current featureSet. But because the Kubernetes API is also still coming up, that fails on an error like [2]: $ curl -s https://gcsweb-ci.apps.ci.l2s4.p1.openshiftapps.com/gcs/origin-ci-test/logs/periodic-ci-openshift-release-master-ci-4.10-e2e-aws-techpreview/1489537240471179264/artifacts/e2e-aws-techpreview/gather-extra/artifacts/pods/openshift-cluster-version_cluster-version-operator-76cd65b7bb-4p945_cluster-version-operator.log | grep 'Error getting featuregate value\|tech' W0204 10:10:40.126809 1 start.go:142] Error getting featuregate value: Get "https://127.0.0.1:6443/apis/config.openshift.io/v1/featuregates/cluster": dial tcp 127.0.0.1:6443: connect: connection refused I0204 10:19:53.097129 1 techpreviewchangestopper.go:97] Starting stop-on-techpreview-change controller with TechPreviewNoUpgrade false. 3. The TechPreviewChangeStopper waits for any FeatureGate changes, but we don't get any. 4. CVO happily spends hours without synchronizing any of the requested TechPreviewNoUpgrade manifests. Step 2 was originally fatal, but I'd softened it in 90b14542ae (pkg/start: Log and continue when we fail to retrieve the feature gate, 2021-12-06, #706). Here are the relevant cases, and how they'd behave with the different approaches: a. No Kube-API hiccup on the initial FeatureGate fetch. All implementations handle this well. b. Kube-API hiccup on the initial FeatureGate fetch. i. And the actual FeatureGate value was not TechPreviewNoUpgrade. Before 90b14542ae, this would have caused a useless CVO container restart. Since 90b14542ae, and unchanged in this commit, the CVO container's default: includeTechPreview := false is correct, and we correctly ignore the hiccup. ii. The actual FeatureGate value was TechPreviewNoUpgrade. Before 90b14542ae, this would cause a useful CVO container restart. From 90b14542ae until this commit, we'd hit the bug case where we'd go an unbounded amount of time failing to reconcile the TechPreviewNoUpgrade manifests the user was asking for. With this commit, we notice the divergence right after the informer caches sync, and restart the CVO container. [1]: https://bugzilla.redhat.com/show_bug.cgi?id=2050946#c0 [2]: https://prow.ci.openshift.org/view/gs/origin-ci-test/logs/periodic-ci-openshift-release-master-ci-4.10-e2e-aws-techpreview/1489537240471179264 --- pkg/featurechangestopper/techpreviewchangestopper.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/featurechangestopper/techpreviewchangestopper.go b/pkg/featurechangestopper/techpreviewchangestopper.go index 4c1e73f704..2f3fb618c0 100644 --- a/pkg/featurechangestopper/techpreviewchangestopper.go +++ b/pkg/featurechangestopper/techpreviewchangestopper.go @@ -40,6 +40,7 @@ func New( queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "feature-gate-stopper"), } + c.queue.Add("cluster") // seed an initial sync, in case startingTechPreviewState is wrong featureGateInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { c.queue.Add("cluster") From a44a3c045fc81990994ed44a8159e942f257aec1 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Sat, 5 Feb 2022 19:11:47 -0800 Subject: [PATCH 2/2] pkg/featurechangestopper: Avoid looping on not-found Not Found errors are a clear configuration for "no feature set", so syncHandler should treat them as successful results. This avoids continually requeuing for a new GET call, now that 79742d717c (pkg/featurechangestopper: Seed queue to guard against incorrect startingTechPreviewState, 2022-02-04, #736) is seeding the queue, even in clusters where there is no FeatureSet clusters. And it also allows us to detect set -> removed transitions if those were allowed, although the API-server is supposed to make it impossible to remove TechPreviewNoUpgrade once it has been set [1]. [1]: https://docs.openshift.com/container-platform/4.9/nodes/clusters/nodes-cluster-enabling-features.html#nodes-cluster-enabling-features-about_nodes-cluster-enabling --- pkg/featurechangestopper/techpreviewchangestopper.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pkg/featurechangestopper/techpreviewchangestopper.go b/pkg/featurechangestopper/techpreviewchangestopper.go index 2f3fb618c0..89942dd478 100644 --- a/pkg/featurechangestopper/techpreviewchangestopper.go +++ b/pkg/featurechangestopper/techpreviewchangestopper.go @@ -9,6 +9,7 @@ import ( configv1 "github.com/openshift/api/config/v1" configinformersv1 "github.com/openshift/client-go/config/informers/externalversions/config/v1" configlistersv1 "github.com/openshift/client-go/config/listers/config/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/tools/cache" @@ -60,11 +61,13 @@ func New( // processNextWorkItem caller handling the queue management. It returns // done when there will be no more work (because the feature gate changed). func (c *TechPreviewChangeStopper) syncHandler(ctx context.Context) (done bool, err error) { - featureGates, err := c.featureGateLister.Get("cluster") - if err != nil { + var current configv1.FeatureSet + if featureGates, err := c.featureGateLister.Get("cluster"); err == nil { + current = featureGates.Spec.FeatureSet + } else if !apierrors.IsNotFound(err) { return false, err } - techPreviewNowSet := featureGates.Spec.FeatureSet == configv1.TechPreviewNoUpgrade + techPreviewNowSet := current == configv1.TechPreviewNoUpgrade if techPreviewNowSet != c.startingTechPreviewState { var action string if c.shutdownFn == nil { @@ -72,7 +75,7 @@ func (c *TechPreviewChangeStopper) syncHandler(ctx context.Context) (done bool, } else { action = "requesting shutdown" } - klog.Infof("TechPreviewNoUpgrade was %t, but the current feature set is %q; %s.", c.startingTechPreviewState, featureGates.Spec.FeatureSet, action) + klog.Infof("TechPreviewNoUpgrade was %t, but the current feature set is %q; %s.", c.startingTechPreviewState, current, action) if c.shutdownFn != nil { c.shutdownFn() }