From 45e35584cfec097a4d23381129e13851ee3a60be Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Wed, 13 May 2020 22:12:23 -0700 Subject: [PATCH] pkg/daemon: Set AddFunc on the nodeInformer as well AddFunc docs in [1,2]. We've been setting an UpdateFunc on this informer since we pivoted to node informers in d67e633b7c (daemon: use informers to check for updates, 2018-10-26, #130). But we need to listen for add events to catch situations where the node's desiredConfig is altered before the informer's initial List call. This avoids situtations like [3,4]: I0514 03:27:54.773485 4242 update.go:1346] Starting to manage node: ci-op-9gpgc-w-d-hdjg2.c.openshift-gce-devel-ci.internal ... I0514 03:42:37.392240 4242 request.go:1068] Response Body: {"kind":"Node","apiVersion":"v1","metadata":{"name":"ci-op-9gpgc-w-d-hdjg2.c.openshift-gce-devel-ci.internal"..."machineconfiguration.openshift.io/desiredConfig":"rendered-node-log-level-14137156-e501-4d3b-9a93-7fff577b45e5-a048da8ad1c08dce6dc6aa3c1101fb44"... ... I0514 03:43:18.111048 1897 update.go:1346] Starting to manage node: ci-op-9gpgc-w-d-hdjg2.c.openshift-gce-devel-ci.internal ... I0514 03:43:21.844234 1897 request.go:1068] Response Body: {"kind":"NodeList","apiVersion":"v1","metadata":{"selfLink":"/api/v1/nodes","resourceVersion":"28107"},"items":[{"metadata":{"name":"ci-op-9gpgc-m-0.c.openshift-gce-devel-ci.internal"..."machineconfiguration.openshift.io/currentConfig":"rendered-master-9f55fba0ad22a8069d04b6ddf87b8ed9","machineconfiguration.openshift.io/desiredConfig":"rendered-master-9f55fba0ad22a8069d04b6ddf87b8ed9"... ... I0514 03:48:15.568762 1897 daemon.go:1119] Updating Node ci-op-9gpgc-w-d-hdjg2.c.openshift-gce-devel-ci.internal ... I0514 03:48:16.569041 1897 daemon.go:368] Started syncing node "ci-op-9gpgc-w-d-hdjg2.c.openshift-gce-devel-ci.internal" (2020-05-14 03:48:16.569022688 +0> I0514 03:48:16.590279 1897 daemon.go:767] Current config: rendered-worker-a048da8ad1c08dce6dc6aa3c1101fb44 I0514 03:48:16.590302 1897 daemon.go:768] Desired config: rendered-node-log-level-14137156-e501-4d3b-9a93-7fff577b45e5-a048da8ad1c08dce6dc6aa3c1101fb44 That was: 1. Somebody sets desiredConfig to rendered-node-log-level-... while the old MCD (PID 4242) is still running. 2. New MCD (PID 1897) comes up, and fires up its Node informer. 3. Informer does a List while starting up, and sees the node with desiredConfig set to rendered-node-log-level-... Goes to call AddFunc, but until this commit we weren't setting it, so MCD does nothing. 4. Informer sets up its Watch. 5. Time passes... 6. Something else changes the managed Node (kubelet heartbeat? Who cares?), Watch trips, UpdateFunc called. 7. MCD notices the desiredConfig and begins applying rendered-node-log-level-... With this commit, we'll react at step 3 instead of 7, hopefully fixing [5]. [1]: https://pkg.go.dev/k8s.io/client-go/tools/cache?tab=doc#ResourceEventHandlerFuncs [2]: https://pkg.go.dev/k8s.io/client-go/tools/cache?tab=doc#ResourceEventHandlerFuncs.OnAdd [3]: https://prow.svc.ci.openshift.org/view/gcs/origin-ci-test/pr-logs/pull/openshift_machine-config-operator/1730/pull-ci-openshift-machine-config-operator-master-e2e-gcp-op/2250 [4]: https://storage.googleapis.com/origin-ci-test/pr-logs/pull/openshift_machine-config-operator/1730/pull-ci-openshift-machine-config-operator-master-e2e-gcp-op/2250/artifacts/e2e-gcp-op/pods/openshift-machine-config-operator_machine-config-daemon-pq5wv_machine-config-daemon.log [5]: https://bugzilla.redhat.com/show_bug.cgi?id=1834895 --- pkg/daemon/daemon.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 15f9578e4b..8bc360a4be 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -97,7 +97,7 @@ type Daemon struct { // channel used to ensure all spawned goroutines exit when we exit. stopCh <-chan struct{} - // node is the current instance of the node being processed through handleNodeUpdate + // node is the current instance of the node being processed through handleNodeEvent // or the very first instance grabbed when the daemon starts node *corev1.Node @@ -278,7 +278,8 @@ func (dn *Daemon) ClusterConnect( go dn.runLoginMonitor(dn.stopCh, dn.exitCh) nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - UpdateFunc: dn.handleNodeUpdate, + AddFunc: dn.handleNodeEvent, + UpdateFunc: func(oldObj, newObj interface{}) { dn.handleNodeEvent(newObj) }, }) dn.nodeLister = nodeInformer.Lister() dn.nodeListerSynced = nodeInformer.Informer().HasSynced @@ -1112,12 +1113,11 @@ func (dn *Daemon) runOnceFromIgnition(ignConfig igntypes.Config) error { return dn.reboot("runOnceFromIgnition complete") } -func (dn *Daemon) handleNodeUpdate(old, cur interface{}) { - oldNode := old.(*corev1.Node) - curNode := cur.(*corev1.Node) +func (dn *Daemon) handleNodeEvent(node interface{}) { + n := node.(*corev1.Node) - glog.V(4).Infof("Updating Node %s", oldNode.Name) - dn.enqueueNode(curNode) + glog.V(4).Infof("Updating Node %s", n.Name) + dn.enqueueNode(n) } // prepUpdateFromCluster handles the shared update prepping functionality for