From d41155f8ec0cd0fdefdea7365971195145a7c0b9 Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Thu, 2 May 2019 17:37:08 +0200
Subject: [PATCH 01/11] pkg/controller: do not progress if the pool isn't idle

The NodeController shouldn't rely just on what it's syncing at that
moment to deduce that a node is unavailable. It may happen that the
pool is updating to a rendered-config-A and isn't done yet, while the
code would still go ahead and apply a new rendered-config-B, causing
more than one node at a time to go unschedulable. This patch should
fix that.
---
 pkg/controller/node/node_controller_test.go |  4 ++--
 pkg/controller/node/status.go               |  3 ++-
 pkg/controller/node/status_test.go          | 10 ++++++++--
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/pkg/controller/node/node_controller_test.go b/pkg/controller/node/node_controller_test.go
index 6a89bd6b76..0eb2aa1516 100644
--- a/pkg/controller/node/node_controller_test.go
+++ b/pkg/controller/node/node_controller_test.go
@@ -452,13 +452,13 @@ func TestGetCandidateMachines(t *testing.T) {
 		expected: []*corev1.Node{newNodeWithReady("node-3", "v0", "v0", corev1.ConditionTrue)},
 	}, {
 		//progress on old stuck node
-		progress: 1,
+		progress: 0,
 		nodes: []*corev1.Node{
 			newNodeWithReady("node-0", "v1", "v1", corev1.ConditionTrue),
 			newNodeWithReady("node-1", "v0.1", "v0.2", corev1.ConditionFalse),
 			newNodeWithReady("node-2", "v0", "v0", corev1.ConditionTrue),
 		},
-		expected: []*corev1.Node{newNodeWithReady("node-1", "v0.1", "v0.2", corev1.ConditionFalse)},
+		expected: []*corev1.Node{},
 	}}
 
 	for idx, test := range tests {
diff --git a/pkg/controller/node/status.go b/pkg/controller/node/status.go
index 75c1647783..daf98c00c8 100644
--- a/pkg/controller/node/status.go
+++ b/pkg/controller/node/status.go
@@ -177,6 +177,7 @@ func isNodeReady(node *corev1.Node) bool {
 	return true
 }
 
+// TODO(runcom): drop currentConfig arg
 func getUnavailableMachines(currentConfig string, nodes []*corev1.Node) []*corev1.Node {
 	var unavail []*corev1.Node
 	for _, node := range nodes {
@@ -193,7 +194,7 @@
 		}
 		nodeNotReady := !isNodeReady(node)
-		if dconfig == currentConfig && (dconfig != cconfig || nodeNotReady) {
+		if dconfig != cconfig || nodeNotReady {
 			unavail = append(unavail, node)
 			glog.V(2).Infof("Node %s unavailable: different configs (desired: %s, current %s) or node not ready %v", node.Name, dconfig, cconfig, nodeNotReady)
 		}
diff --git a/pkg/controller/node/status_test.go b/pkg/controller/node/status_test.go
index e437ccd1e6..4a588ee293 100644
--- a/pkg/controller/node/status_test.go
+++ b/pkg/controller/node/status_test.go
@@ -239,7 +239,10 @@ func TestGetUnavailableMachines(t *testing.T) {
 			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
 		},
 		currentConfig: "v2",
-		unavail: []*corev1.Node{newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue)},
+		unavail: []*corev1.Node{
+			newNodeWithReady("node-0", "v0", "v1", corev1.ConditionTrue),
+			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
+		},
 	}, {
 		// 1 node updated, 1 updating, 1 updating but not v2 and is not ready
 		nodes: []*corev1.Node{
 			newNodeWithReady("node-0", "v0", "v1", corev1.ConditionFalse),
 			newNodeWithReady("node-1", "v1", "v1", corev1.ConditionTrue),
 			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
 		},
 		currentConfig: "v2",
-		unavail: []*corev1.Node{newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue)},
+		unavail: []*corev1.Node{
+			newNodeWithReady("node-0", "v0", "v1", corev1.ConditionFalse),
+			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
+		},
 	}, {
 		// 2 node updated, 1 updating
 		nodes: []*corev1.Node{

From 45ebf1c1dc040a87c74357f5fb146c83e36e5967 Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Thu, 2 May 2019 20:03:51 +0200
Subject: [PATCH 02/11] test/e2e: increase timeout on checking nodes

---
 Makefile             | 2 +-
 test/e2e/mcd_test.go | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 644bf6261b..c179cbd798 100644
--- a/Makefile
+++ b/Makefile
@@ -98,4 +98,4 @@ images.rhel7: $(imc7)
 
 # This was copied from https://github.com/openshift/cluster-image-registry-operato
 test-e2e:
-	GOCACHE=off go test -timeout 55m -v$${WHAT:+ -run="$$WHAT"} ./test/e2e/
+	GOCACHE=off go test -timeout 75m -v$${WHAT:+ -run="$$WHAT"} ./test/e2e/
diff --git a/test/e2e/mcd_test.go b/test/e2e/mcd_test.go
index 20de940389..2012bd3f82 100644
--- a/test/e2e/mcd_test.go
+++ b/test/e2e/mcd_test.go
@@ -121,7 +121,7 @@ func TestMCDeployed(t *testing.T) {
 	}
 
 	visited := make(map[string]bool)
-	if err := wait.Poll(2*time.Second, 10*time.Minute, func() (bool, error) {
+	if err := wait.Poll(2*time.Second, 30*time.Minute, func() (bool, error) {
 		nodes, err := getNodesByRole(cs, "worker")
 		if err != nil {
 			return false, nil

From 44abdd51376a0f8e2d50f1f96fb6af11925c35b8 Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Thu, 2 May 2019 21:54:46 +0200
Subject: [PATCH 03/11] pkg/daemon: adjust sigterm handler

https://bugzilla.redhat.com/show_bug.cgi?id=1703877
---
 cmd/machine-config-daemon/start.go |  2 ++
 pkg/daemon/daemon.go               | 26 ++++++++++++++++++++-
 pkg/daemon/update.go               | 36 ++++++++----------------
 3 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/cmd/machine-config-daemon/start.go b/cmd/machine-config-daemon/start.go
index 69aad30c58..d7596363ae 100644
--- a/cmd/machine-config-daemon/start.go
+++ b/cmd/machine-config-daemon/start.go
@@ -206,6 +206,8 @@ func runStartCmd(cmd *cobra.Command, args []string) {
 	glog.Info("Starting MachineConfigDaemon")
 	defer glog.Info("Shutting down MachineConfigDaemon")
 
+	dn.InstallSignalHandler(stopCh)
+
 	if err := dn.Run(stopCh, exitCh); err != nil {
 		glog.Fatalf("Failed to run: %v", err)
 	}
diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go
index 14a3dedd11..1471f6b621 100644
--- a/pkg/daemon/daemon.go
+++ b/pkg/daemon/daemon.go
@@ -11,8 +11,10 @@ import (
 	"net/http"
 	"os"
 	"os/exec"
+	"os/signal"
 	"path/filepath"
 	"strings"
+	"syscall"
 	"time"
 
 	imgref "github.com/containers/image/docker/reference"
@@ -88,7 +90,7 @@ type Daemon struct {
 	kubeletHealthzEnabled  bool
 	kubeletHealthzEndpoint string
 
-	installedSigterm bool
+	updateActive bool
 
 	nodeWriter NodeWriter
 
@@ -469,6 +471,28 @@ func (dn *Daemon) runOnceFrom() error {
 	return errors.New("unsupported onceFrom type provided")
}
 
+// InstallSignalHandler installs the handler for the signals the daemon should act on
+func (dn *Daemon) InstallSignalHandler(stopCh chan struct{}) {
+	termChan := make(chan os.Signal, 2048)
+	signal.Notify(termChan, syscall.SIGTERM)
+
+	// Catch SIGTERM - if we're actively updating, we should avoid
+	// having the process be killed.
+	// https://github.com/openshift/machine-config-operator/issues/407
+	go func() {
+		for sig := range termChan {
+			switch sig {
+			case syscall.SIGTERM:
+				if dn.updateActive {
+					glog.Info("Got SIGTERM, but actively updating")
+				} else {
+					close(stopCh)
+				}
+			}
+		}
+	}()
+}
+
 // Run finishes informer setup and then blocks, and the informer will be
 // responsible for triggering callbacks to handle updates. Successful
 // updates shouldn't return, and should just reboot the node.
diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go
index 6c122182b7..a8d565630b 100644
--- a/pkg/daemon/update.go
+++ b/pkg/daemon/update.go
@@ -7,12 +7,10 @@ import (
 	"io"
 	"os"
 	"os/exec"
-	"os/signal"
 	"os/user"
 	"path/filepath"
 	"reflect"
 	"strconv"
-	"syscall"
 	"time"
 
 	ignv2_2types "github.com/coreos/ignition/config/v2_2/types"
@@ -154,28 +152,6 @@ func (dn *Daemon) updateOSAndReboot(newConfig *mcfgv1.MachineConfig) (retErr err
 	return dn.reboot(fmt.Sprintf("Node will reboot into config %v", newConfig.GetName()), defaultRebootTimeout, exec.Command(defaultRebootCommand))
 }
 
-// isUpdating returns true if the MCD is actively applying an update
-func (dn *Daemon) catchIgnoreSIGTERM() {
-	if dn.installedSigterm {
-		return
-	}
-
-	termChan := make(chan os.Signal, 1)
-	signal.Notify(termChan, syscall.SIGTERM)
-
-	dn.installedSigterm = true
-
-	// Catch SIGTERM - if we're actively updating, we should avoid
-	// having the process be killed.
-	// https://github.com/openshift/machine-config-operator/issues/407
-	go func() {
-		for {
-			<-termChan
-			glog.Info("Got SIGTERM, but actively updating")
-		}
-	}()
-}
-
 var errUnreconcilable = errors.New("unreconcilable")
 
 // update the node to the provided node configuration.
@@ -751,10 +727,16 @@ func (dn *Daemon) logSystem(format string, a ...interface{}) {
 	}
 }
 
+func (dn *Daemon) catchIgnoreSIGTERM() {
+	if dn.updateActive {
+		return
+	}
+	dn.updateActive = true
+}
+
 func (dn *Daemon) cancelSIGTERM() {
-	if dn.installedSigterm {
-		signal.Reset(syscall.SIGTERM)
-		dn.installedSigterm = false
+	if dn.updateActive {
+		dn.updateActive = false
 	}
 }

From dc12aa40fb07cbefc0e85880c00bb6a4b7995b4d Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Fri, 3 May 2019 10:50:39 +0200
Subject: [PATCH 04/11] test/e2e: bump pool's maxUnavailable

---
 test/e2e/mcd_test.go | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/test/e2e/mcd_test.go b/test/e2e/mcd_test.go
index 2012bd3f82..c4c38d8675 100644
--- a/test/e2e/mcd_test.go
+++ b/test/e2e/mcd_test.go
@@ -1,6 +1,7 @@
 package e2e_test
 
 import (
+	"encoding/json"
 	"fmt"
 	"os/exec"
 	"strings"
@@ -11,10 +12,14 @@
 	mcv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1"
 	"github.com/openshift/machine-config-operator/pkg/daemon/constants"
 	"github.com/openshift/machine-config-operator/test/e2e/framework"
+	"github.com/stretchr/testify/require"
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/fields"
 	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/apimachinery/pkg/util/jsonmergepatch"
 	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/apimachinery/pkg/util/wait"
 )
@@ -92,8 +97,10 @@ func createMCToAddFile(name, filename, data, fs string) *mcv1.MachineConfig {
 
 func TestMCDeployed(t *testing.T) {
 	cs := framework.NewClientSet("")
+	bumpPoolMaxUnavailableTo(t, cs, 3)
 
-	for i := 0; i < 10; i++ {
+	// TODO: bring this back to 10
+	for i := 0; i < 5; i++ {
 		mcadd := createMCToAddFile("add-a-file", fmt.Sprintf("/etc/mytestconf%d", i), "test", "root")
 
 		// create the dummy MC now
@@ -145,8 +152,24 @@ func TestMCDeployed(t *testing.T) {
 	}
 }
 
+func bumpPoolMaxUnavailableTo(t *testing.T, cs *framework.ClientSet, max int) {
+	pool, err := cs.MachineConfigPools().Get("worker", metav1.GetOptions{})
+	require.Nil(t, err)
+	old, err := json.Marshal(pool)
+	require.Nil(t, err)
+	maxUnavailable := intstr.FromInt(max)
+	pool.Spec.MaxUnavailable = &maxUnavailable
+	new, err := json.Marshal(pool)
+	require.Nil(t, err)
+	patch, err := jsonmergepatch.CreateThreeWayJSONMergePatch(old, new, old)
+	require.Nil(t, err)
+	_, err = cs.MachineConfigPools().Patch("worker", types.MergePatchType, patch)
+	require.Nil(t, err)
+}
+
 func TestUpdateSSH(t *testing.T) {
 	cs := framework.NewClientSet("")
+	bumpPoolMaxUnavailableTo(t, cs, 3)
 
 	// create a dummy MC with an sshKey for user Core
 	mcName := fmt.Sprintf("sshkeys-worker-%s", uuid.NewUUID())
@@ -310,6 +333,7 @@ func TestPoolDegradedOnFailToRender(t *testing.T) {
 
 func TestReconcileAfterBadMC(t *testing.T) {
 	cs := framework.NewClientSet("")
+	bumpPoolMaxUnavailableTo(t, cs, 3)
 
 	// create a bad MC w/o a filesystem field which is going to fail reconciling
 	mcadd := createMCToAddFile("add-a-file", "/etc/mytestconfs", "test", "")
@@ -372,7 +396,7 @@
 		if err != nil {
 			return false, err
 		}
-		if mcp.Status.UnavailableMachineCount == 1 {
+		if mcp.Status.UnavailableMachineCount >= 1 {
 			return true, nil
 		}
 		return false, nil
@@ -400,7 +424,7 @@
 	}
 
 	visited := make(map[string]bool)
-	if err := wait.Poll(2*time.Second, 10*time.Minute, func() (bool, error) {
+	if err := wait.Poll(2*time.Second, 30*time.Minute, func() (bool, error) {
 		nodes, err := getNodesByRole(cs, "worker")
 		if err != nil {
 			return false, err

From 87f4cc96fc9e8744a1a014674a3739ad6ce090e0 Mon Sep 17 00:00:00 2001
From: Colin Walters
Date: Thu, 2 May 2019 15:07:59 +0000
Subject: [PATCH 05/11] controller: Log when a node has been updated

The MCC logs show when we tell a node to start, but don't show us when
it's done. This helps make the MCC logs a centralized aggregate of
events.

Doing this now to aid debugging slowness in updates in our
`e2e-aws-op` test suite.
---
 pkg/controller/node/node_controller.go | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pkg/controller/node/node_controller.go b/pkg/controller/node/node_controller.go
index 04fcd87258..0765634feb 100644
--- a/pkg/controller/node/node_controller.go
+++ b/pkg/controller/node/node_controller.go
@@ -200,7 +200,14 @@ func (ctrl *Controller) updateNode(old, cur interface{}) {
 	if pool == nil {
 		return
 	}
-	glog.V(4).Infof("Node %s updated", curNode.Name)
+
+	// Specifically log when a node has completed an update so the MCC logs are a useful central aggregate of state changes
+	if oldNode.Annotations[daemonconsts.CurrentMachineConfigAnnotationKey] != oldNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] &&
+		curNode.Annotations[daemonconsts.CurrentMachineConfigAnnotationKey] == curNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] {
+		glog.Infof("Pool %s: node %s has completed update to %s", pool.Name, curNode.Name, curNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey])
+	} else {
+		glog.V(4).Infof("Node %s updated", curNode.Name)
+	}
 	ctrl.enqueueMachineConfigPool(pool)
 }
 

From 2a19206dc5c74821bc48ad379f4561cb03ec7a95 Mon Sep 17 00:00:00 2001
From: Colin Walters
Date: Fri, 3 May 2019 09:05:02 +0000
Subject: [PATCH 06/11] status: Add some doc comments

General principle.
---
 pkg/controller/node/status.go | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pkg/controller/node/status.go b/pkg/controller/node/status.go
index daf98c00c8..66d4667d73 100644
--- a/pkg/controller/node/status.go
+++ b/pkg/controller/node/status.go
@@ -111,6 +111,8 @@ func calculateStatus(pool *mcfgv1.MachineConfigPool, nodes []*corev1.Node) mcfgv
 	return status
 }
 
+// getUpdatedMachines filters the provided nodes to ones whose current == desired
+// and the "done" flag is set.
 func getUpdatedMachines(currentConfig string, nodes []*corev1.Node) []*corev1.Node {
 	var updated []*corev1.Node
 	for _, node := range nodes {
@@ -138,6 +140,8 @@ func getUpdatedMachines(currentConfig string, nodes []*corev1.Node) []*corev1.No
 	return updated
 }
 
+// getReadyMachines filters the provided nodes to ones which are updated
+// and marked ready.
 func getReadyMachines(currentConfig string, nodes []*corev1.Node) []*corev1.Node {
 	updated := getUpdatedMachines(currentConfig, nodes)
 	var ready []*corev1.Node
@@ -178,6 +182,9 @@ func isNodeReady(node *corev1.Node) bool {
 	return true
 }
 
 // TODO(runcom): drop currentConfig arg
+// getUnavailableMachines is the opposite of getReadyMachines - it filters the provided
+// set of nodes to ones which are either not ready, or whose current config does
+// not match the target.
 func getUnavailableMachines(currentConfig string, nodes []*corev1.Node) []*corev1.Node {
 	var unavail []*corev1.Node
 	for _, node := range nodes {

From 60c9243451ba0f155da3a3e7cdc0028a37e972c9 Mon Sep 17 00:00:00 2001
From: Colin Walters
Date: Fri, 3 May 2019 09:13:28 +0000
Subject: [PATCH 07/11] daemon: Log drain timings to journal too

The journal saves our history in a reliable way; let's log more things
there too so we can debug e.g. how long things took.

Also consistently log to the journal with the `machine-config-daemon: `
prefix (note the colon `:`) so it's easy to do:
`journalctl -r |grep machine-config-daemon:`
---
 pkg/daemon/daemon.go | 2 +-
 pkg/daemon/update.go | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go
index 1471f6b621..6921f39878 100644
--- a/pkg/daemon/daemon.go
+++ b/pkg/daemon/daemon.go
@@ -1000,7 +1000,7 @@ func (dn *Daemon) completeUpdate(node *corev1.Node, desiredConfigName string) er
 		return err
 	}
 
-	dn.logSystem("machine-config-daemon: completed update for config %s", desiredConfigName)
+	dn.logSystem("completed update for config %s", desiredConfigName)
 
 	return nil
 }
diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go
index a8d565630b..4a6aa76d8f 100644
--- a/pkg/daemon/update.go
+++ b/pkg/daemon/update.go
@@ -116,7 +116,7 @@ func (dn *Daemon) updateOSAndReboot(newConfig *mcfgv1.MachineConfig) (retErr err
 
 	// Skip draining of the node when we're not cluster driven
 	if dn.onceFrom == "" {
-		glog.Info("Update prepared; draining the node")
+		dn.logSystem("Update prepared; beginning drain")
 
 		dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Drain", "Draining node to update config.")
@@ -145,7 +145,7 @@
 			}
 			return errors.Wrap(err, "failed to drain node")
 		}
-		glog.Info("Node successfully drained")
+		dn.logSystem("drain complete")
 	}
 
 	// reboot. this function shouldn't actually return.
@@ -718,6 +718,7 @@ func (dn *Daemon) logSystem(format string, a ...interface{}) {
 
 	go func() {
 		defer stdin.Close()
+		io.WriteString(stdin, "machine-config-daemon: ")
 		io.WriteString(stdin, message)
 	}()
 	err = logger.Run()
@@ -748,7 +749,7 @@ func (dn *Daemon) reboot(rationale string, timeout time.Duration, rebootCmd *exe
 	if dn.recorder != nil {
 		dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Reboot", rationale)
 	}
-	dn.logSystem("machine-config-daemon initiating reboot: %s", rationale)
+	dn.logSystem("initiating reboot: %s", rationale)
 
 	// Now that everything is done, avoid delaying shutdown.
 	dn.cancelSIGTERM()

From b042fecb3d40ed4707b373f225f350d298560a01 Mon Sep 17 00:00:00 2001
From: Colin Walters
Date: Fri, 3 May 2019 09:19:06 +0000
Subject: [PATCH 08/11] daemon: Log to journal MCD startup and initiating
 config write

For more debugging.
---
 pkg/daemon/daemon.go | 2 +-
 pkg/daemon/update.go | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go
index 6921f39878..4412646541 100644
--- a/pkg/daemon/daemon.go
+++ b/pkg/daemon/daemon.go
@@ -268,7 +268,7 @@ func NewClusterDrivenDaemon(
 	eventBroadcaster.StartRecordingToSink(&clientsetcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
 	dn.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "machineconfigdaemon", Host: nodeName})
 
-	glog.Infof("Managing node: %s", nodeName)
+	dn.logSystem("Starting to manage node: %s", nodeName)
 
 	go dn.runLoginMonitor(dn.stopCh, dn.exitCh)
diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go
index 4a6aa76d8f..f7dfafc4ff 100644
--- a/pkg/daemon/update.go
+++ b/pkg/daemon/update.go
@@ -194,6 +194,7 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr err
 		dn.logSystem(wrappedErr.Error())
 		return errors.Wrapf(errUnreconcilable, "%v", wrappedErr)
 	}
+	dn.logSystem("Starting update from %s to %s", oldConfigName, newConfigName)
 
 	// update files on disk that need updating
 	if err := dn.updateFiles(oldConfig, newConfig); err != nil {

From 8b8bef915bb4c2e97aabfd5b67af445455a9a105 Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Fri, 3 May 2019 20:23:44 +0200
Subject: [PATCH 09/11] Revert "pkg/controller: do not progress if the pool
 isn't idle"

This reverts commit d41155f8ec0cd0fdefdea7365971195145a7c0b9.
---
 pkg/controller/node/node_controller_test.go |  4 ++--
 pkg/controller/node/status.go               |  6 +-----
 pkg/controller/node/status_test.go          | 10 ++--------
 3 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/pkg/controller/node/node_controller_test.go b/pkg/controller/node/node_controller_test.go
index 0eb2aa1516..6a89bd6b76 100644
--- a/pkg/controller/node/node_controller_test.go
+++ b/pkg/controller/node/node_controller_test.go
@@ -452,13 +452,13 @@ func TestGetCandidateMachines(t *testing.T) {
 		expected: []*corev1.Node{newNodeWithReady("node-3", "v0", "v0", corev1.ConditionTrue)},
 	}, {
 		//progress on old stuck node
-		progress: 0,
+		progress: 1,
 		nodes: []*corev1.Node{
 			newNodeWithReady("node-0", "v1", "v1", corev1.ConditionTrue),
 			newNodeWithReady("node-1", "v0.1", "v0.2", corev1.ConditionFalse),
 			newNodeWithReady("node-2", "v0", "v0", corev1.ConditionTrue),
 		},
-		expected: []*corev1.Node{},
+		expected: []*corev1.Node{newNodeWithReady("node-1", "v0.1", "v0.2", corev1.ConditionFalse)},
 	}}
 
 	for idx, test := range tests {
diff --git a/pkg/controller/node/status.go b/pkg/controller/node/status.go
index 66d4667d73..4ef5b48d64 100644
--- a/pkg/controller/node/status.go
+++ b/pkg/controller/node/status.go
@@ -181,10 +181,6 @@ func isNodeReady(node *corev1.Node) bool {
 	return true
 }
 
-// TODO(runcom): drop currentConfig arg
-// getUnavailableMachines is the opposite of getReadyMachines - it filters the provided
-// set of nodes to ones which are either not ready, or whose current config does
-// not match the target.
 func getUnavailableMachines(currentConfig string, nodes []*corev1.Node) []*corev1.Node {
 	var unavail []*corev1.Node
 	for _, node := range nodes {
@@ -201,7 +197,7 @@
 		}
 		nodeNotReady := !isNodeReady(node)
-		if dconfig != cconfig || nodeNotReady {
+		if dconfig == currentConfig && (dconfig != cconfig || nodeNotReady) {
 			unavail = append(unavail, node)
 			glog.V(2).Infof("Node %s unavailable: different configs (desired: %s, current %s) or node not ready %v", node.Name, dconfig, cconfig, nodeNotReady)
 		}
diff --git a/pkg/controller/node/status_test.go b/pkg/controller/node/status_test.go
index 4a588ee293..e437ccd1e6 100644
--- a/pkg/controller/node/status_test.go
+++ b/pkg/controller/node/status_test.go
@@ -239,10 +239,7 @@ func TestGetUnavailableMachines(t *testing.T) {
 			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
 		},
 		currentConfig: "v2",
-		unavail: []*corev1.Node{
-			newNodeWithReady("node-0", "v0", "v1", corev1.ConditionTrue),
-			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
-		},
+		unavail: []*corev1.Node{newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue)},
 	}, {
 		// 1 node updated, 1 updating, 1 updating but not v2 and is not ready
 		nodes: []*corev1.Node{
 			newNodeWithReady("node-0", "v0", "v1", corev1.ConditionFalse),
 			newNodeWithReady("node-1", "v1", "v1", corev1.ConditionTrue),
 			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
 		},
 		currentConfig: "v2",
-		unavail: []*corev1.Node{
-			newNodeWithReady("node-0", "v0", "v1", corev1.ConditionFalse),
-			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
-		},
+		unavail: []*corev1.Node{newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue)},
 	}, {
 		// 2 node updated, 1 updating
 		nodes: []*corev1.Node{

From 0688ba8bcffffb1f6debd4da312ae8fc9b799d58 Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Fri, 3 May 2019 20:26:18 +0200
Subject: [PATCH 10/11] store currentconfig under /etc since we have a bind
 mount from the host

---
 pkg/daemon/daemon.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go
index 4412646541..322cec26f8 100644
--- a/pkg/daemon/daemon.go
+++ b/pkg/daemon/daemon.go
@@ -135,7 +135,7 @@ const (
 	pathStateJSON = "/etc/machine-config-daemon/state.json"
 	// currentConfigPath is where we store the current config on disk to validate
 	// against annotations changes
-	currentConfigPath = "/var/machine-config-daemon/currentconfig"
+	currentConfigPath = "/etc/machine-config-daemon/currentconfig"
 
 	kubeletHealthzEndpoint        = "http://localhost:10248/healthz"
 	kubeletHealthzPollingInterval = time.Duration(30 * time.Second)

From dba25ce796c79d2b9ebc6aa7cc3885d85a7bfa4b Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Fri, 3 May 2019 22:48:38 +0200
Subject: [PATCH 11/11] pkg/daemon: use a channel to signal termination

---
 cmd/machine-config-daemon/start.go | 5 +++--
 pkg/daemon/daemon.go               | 8 +++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/cmd/machine-config-daemon/start.go b/cmd/machine-config-daemon/start.go
index d7596363ae..61dd94eeec 100644
--- a/cmd/machine-config-daemon/start.go
+++ b/cmd/machine-config-daemon/start.go
@@ -206,9 +206,10 @@ func runStartCmd(cmd *cobra.Command, args []string) {
 	glog.Info("Starting MachineConfigDaemon")
 	defer glog.Info("Shutting down MachineConfigDaemon")
 
-	dn.InstallSignalHandler(stopCh)
+	signaled := make(chan struct{})
+	dn.InstallSignalHandler(signaled)
 
-	if err := dn.Run(stopCh, exitCh); err != nil {
+	if err := dn.Run(stopCh, signaled, exitCh); err != nil {
 		glog.Fatalf("Failed to run: %v", err)
 	}
 }
diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go
index 322cec26f8..d3384afb59 100644
--- a/pkg/daemon/daemon.go
+++ b/pkg/daemon/daemon.go
@@ -472,7 +472,7 @@ func (dn *Daemon) runOnceFrom() error {
 }
 
 // InstallSignalHandler installs the handler for the signals the daemon should act on
-func (dn *Daemon) InstallSignalHandler(stopCh chan struct{}) {
+func (dn *Daemon) InstallSignalHandler(signaled chan struct{}) {
 	termChan := make(chan os.Signal, 2048)
 	signal.Notify(termChan, syscall.SIGTERM)
 
@@ -486,7 +486,7 @@
 				if dn.updateActive {
 					glog.Info("Got SIGTERM, but actively updating")
 				} else {
-					close(stopCh)
+					close(signaled)
 				}
 			}
 		}
@@ -496,7 +496,7 @@
 // Run finishes informer setup and then blocks, and the informer will be
 // responsible for triggering callbacks to handle updates. Successful
 // updates shouldn't return, and should just reboot the node.
-func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {
+func (dn *Daemon) Run(stopCh, signaled <-chan struct{}, exitCh <-chan error) error {
 	if dn.kubeletHealthzEnabled {
 		glog.Info("Enabling Kubelet Healthz Monitor")
 		go dn.runKubeletHealthzMonitor(stopCh, dn.exitCh)
@@ -520,6 +520,8 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {
 	select {
 	case <-stopCh:
 		return nil
+	case <-signaled:
+		return nil
 	case err := <-exitCh:
 		// This channel gets errors from auxiliary goroutines like loginmonitor and kubehealth
 		glog.Warningf("Got an error from auxiliary tools: %v", err)