From d41155f8ec0cd0fdefdea7365971195145a7c0b9 Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Thu, 2 May 2019 17:37:08 +0200
Subject: [PATCH 01/11] pkg/controller: do not progress if the pool isn't idle

The NodeController shouldn't rely just on what it's syncing at that
moment to deduce that a node is unavailable. It may happen that the
pool is updating to a rendered-config-A and isn't done yet, while the
code would still go ahead and apply a new rendered-config-B, causing
more than one node at a time to go unschedulable. This patch should
fix that.
---
 pkg/controller/node/node_controller_test.go |  4 ++--
 pkg/controller/node/status.go               |  3 ++-
 pkg/controller/node/status_test.go          | 10 ++++++++--
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/pkg/controller/node/node_controller_test.go b/pkg/controller/node/node_controller_test.go
index 6a89bd6b76..0eb2aa1516 100644
--- a/pkg/controller/node/node_controller_test.go
+++ b/pkg/controller/node/node_controller_test.go
@@ -452,13 +452,13 @@ func TestGetCandidateMachines(t *testing.T) {
 		expected: []*corev1.Node{newNodeWithReady("node-3", "v0", "v0", corev1.ConditionTrue)},
 	}, {
 		//progress on old stuck node
-		progress: 1,
+		progress: 0,
 		nodes: []*corev1.Node{
 			newNodeWithReady("node-0", "v1", "v1", corev1.ConditionTrue),
 			newNodeWithReady("node-1", "v0.1", "v0.2", corev1.ConditionFalse),
 			newNodeWithReady("node-2", "v0", "v0", corev1.ConditionTrue),
 		},
-		expected: []*corev1.Node{newNodeWithReady("node-1", "v0.1", "v0.2", corev1.ConditionFalse)},
+		expected: []*corev1.Node{},
 	}}
 
 	for idx, test := range tests {
diff --git a/pkg/controller/node/status.go b/pkg/controller/node/status.go
index 75c1647783..daf98c00c8 100644
--- a/pkg/controller/node/status.go
+++ b/pkg/controller/node/status.go
@@ -177,6 +177,7 @@ func isNodeReady(node *corev1.Node) bool {
 	return true
 }
 
+// TODO(runcom): drop currentConfig arg
 func getUnavailableMachines(currentConfig string, nodes []*corev1.Node) []*corev1.Node {
 	var unavail []*corev1.Node
 	for _, node := range nodes {
@@ -193,7 +194,7 @@
 		}
 		nodeNotReady := !isNodeReady(node)
-		if dconfig == currentConfig && (dconfig != cconfig || nodeNotReady) {
+		if dconfig != cconfig || nodeNotReady {
 			unavail = append(unavail, node)
 			glog.V(2).Infof("Node %s unavailable: different configs (desired: %s, current %s) or node not ready %v", node.Name, dconfig, cconfig, nodeNotReady)
 		}
diff --git a/pkg/controller/node/status_test.go b/pkg/controller/node/status_test.go
index e437ccd1e6..4a588ee293 100644
--- a/pkg/controller/node/status_test.go
+++ b/pkg/controller/node/status_test.go
@@ -239,7 +239,10 @@ func TestGetUnavailableMachines(t *testing.T) {
 			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
 		},
 		currentConfig: "v2",
-		unavail: []*corev1.Node{newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue)},
+		unavail: []*corev1.Node{
+			newNodeWithReady("node-0", "v0", "v1", corev1.ConditionTrue),
+			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
+		},
 	}, {
 		// 1 node updated, 1 updating, 1 updating but not v2 and is not ready
 		nodes: []*corev1.Node{
 			newNodeWithReady("node-0", "v0", "v1", corev1.ConditionFalse),
 			newNodeWithReady("node-1", "v1", "v1", corev1.ConditionTrue),
 			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
 		},
 		currentConfig: "v2",
-		unavail: []*corev1.Node{newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue)},
+		unavail: []*corev1.Node{
+			newNodeWithReady("node-0", "v0", "v1", corev1.ConditionFalse),
+			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
+		},
 	}, {
 		// 2 node updated, 1 updating
 		nodes: []*corev1.Node{

From 45ebf1c1dc040a87c74357f5fb146c83e36e5967 Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Thu, 2 May 2019 20:03:51 +0200
Subject: [PATCH 02/11] test/e2e: increase timeout on checking nodes

---
 Makefile             | 2 +-
 test/e2e/mcd_test.go | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 644bf6261b..c179cbd798 100644
--- a/Makefile
+++ b/Makefile
@@ -98,4 +98,4 @@ images.rhel7: $(imc7)
 
 # This was copied from https://github.com/openshift/cluster-image-registry-operato
 test-e2e:
-	GOCACHE=off go test -timeout 55m -v$${WHAT:+ -run="$$WHAT"} ./test/e2e/
+	GOCACHE=off go test -timeout 75m -v$${WHAT:+ -run="$$WHAT"} ./test/e2e/
diff --git a/test/e2e/mcd_test.go b/test/e2e/mcd_test.go
index 20de940389..2012bd3f82 100644
--- a/test/e2e/mcd_test.go
+++ b/test/e2e/mcd_test.go
@@ -121,7 +121,7 @@ func TestMCDeployed(t *testing.T) {
 	}
 
 	visited := make(map[string]bool)
-	if err := wait.Poll(2*time.Second, 10*time.Minute, func() (bool, error) {
+	if err := wait.Poll(2*time.Second, 30*time.Minute, func() (bool, error) {
 		nodes, err := getNodesByRole(cs, "worker")
 		if err != nil {
 			return false, nil

From 44abdd51376a0f8e2d50f1f96fb6af11925c35b8 Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Thu, 2 May 2019 21:54:46 +0200
Subject: [PATCH 03/11] pkg/daemon: adjust sigterm handler

https://bugzilla.redhat.com/show_bug.cgi?id=1703877
---
 cmd/machine-config-daemon/start.go |  2 ++
 pkg/daemon/daemon.go               | 26 ++++++++++++++++++++-
 pkg/daemon/update.go               | 36 ++++++++----------------
 3 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/cmd/machine-config-daemon/start.go b/cmd/machine-config-daemon/start.go
index 69aad30c58..d7596363ae 100644
--- a/cmd/machine-config-daemon/start.go
+++ b/cmd/machine-config-daemon/start.go
@@ -206,6 +206,8 @@ func runStartCmd(cmd *cobra.Command, args []string) {
 	glog.Info("Starting MachineConfigDaemon")
 	defer glog.Info("Shutting down MachineConfigDaemon")
 
+	dn.InstallSignalHandler(stopCh)
+
 	if err := dn.Run(stopCh, exitCh); err != nil {
 		glog.Fatalf("Failed to run: %v", err)
 	}
diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go
index 14a3dedd11..1471f6b621 100644
--- a/pkg/daemon/daemon.go
+++ b/pkg/daemon/daemon.go
@@ -11,8 +11,10 @@ import (
 	"net/http"
 	"os"
 	"os/exec"
+	"os/signal"
 	"path/filepath"
 	"strings"
+	"syscall"
 	"time"
 
 	imgref "github.com/containers/image/docker/reference"
@@ -88,7 +90,7 @@ type Daemon struct {
 	kubeletHealthzEnabled  bool
 	kubeletHealthzEndpoint string
 
-	installedSigterm bool
+	updateActive bool
 
 	nodeWriter NodeWriter
 
@@ -469,6 +471,28 @@ func (dn *Daemon) runOnceFrom() error {
 	return errors.New("unsupported onceFrom type provided")
}
 
+// InstallSignalHandler installs the handler for the signals the daemon should act on
+func (dn *Daemon) InstallSignalHandler(stopCh chan struct{}) {
+	termChan := make(chan os.Signal, 2048)
+	signal.Notify(termChan, syscall.SIGTERM)
+
+	// Catch SIGTERM - if we're actively updating, we should avoid
+	// having the process be killed.
+	// https://github.com/openshift/machine-config-operator/issues/407
+	go func() {
+		for sig := range termChan {
+			switch sig {
+			case syscall.SIGTERM:
+				if dn.updateActive {
+					glog.Info("Got SIGTERM, but actively updating")
+				} else {
+					close(stopCh)
+				}
+			}
+		}
+	}()
+}
+
 // Run finishes informer setup and then blocks, and the informer will be
 // responsible for triggering callbacks to handle updates. Successful
 // updates shouldn't return, and should just reboot the node.
diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go
index 6c122182b7..a8d565630b 100644
--- a/pkg/daemon/update.go
+++ b/pkg/daemon/update.go
@@ -7,12 +7,10 @@ import (
 	"io"
 	"os"
 	"os/exec"
-	"os/signal"
 	"os/user"
 	"path/filepath"
 	"reflect"
 	"strconv"
-	"syscall"
 	"time"
 
 	ignv2_2types "github.com/coreos/ignition/config/v2_2/types"
@@ -154,28 +152,6 @@ func (dn *Daemon) updateOSAndReboot(newConfig *mcfgv1.MachineConfig) (retErr err
 	return dn.reboot(fmt.Sprintf("Node will reboot into config %v", newConfig.GetName()), defaultRebootTimeout, exec.Command(defaultRebootCommand))
 }
 
-// isUpdating returns true if the MCD is actively applying an update
-func (dn *Daemon) catchIgnoreSIGTERM() {
-	if dn.installedSigterm {
-		return
-	}
-
-	termChan := make(chan os.Signal, 1)
-	signal.Notify(termChan, syscall.SIGTERM)
-
-	dn.installedSigterm = true
-
-	// Catch SIGTERM - if we're actively updating, we should avoid
-	// having the process be killed.
-	// https://github.com/openshift/machine-config-operator/issues/407
-	go func() {
-		for {
-			<-termChan
-			glog.Info("Got SIGTERM, but actively updating")
-		}
-	}()
-}
-
 var errUnreconcilable = errors.New("unreconcilable")
 
 // update the node to the provided node configuration.
@@ -751,10 +727,16 @@ func (dn *Daemon) logSystem(format string, a ...interface{}) {
 	}
 }
 
+func (dn *Daemon) catchIgnoreSIGTERM() {
+	if dn.updateActive {
+		return
+	}
+	dn.updateActive = true
+}
+
 func (dn *Daemon) cancelSIGTERM() {
-	if dn.installedSigterm {
-		signal.Reset(syscall.SIGTERM)
-		dn.installedSigterm = false
+	if dn.updateActive {
+		dn.updateActive = false
 	}
 }

From dc12aa40fb07cbefc0e85880c00bb6a4b7995b4d Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Fri, 3 May 2019 10:50:39 +0200
Subject: [PATCH 04/11] test/e2e: bump pool's maxUnavailable

---
 test/e2e/mcd_test.go | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/test/e2e/mcd_test.go b/test/e2e/mcd_test.go
index 2012bd3f82..c4c38d8675 100644
--- a/test/e2e/mcd_test.go
+++ b/test/e2e/mcd_test.go
@@ -1,6 +1,7 @@
 package e2e_test
 
 import (
+	"encoding/json"
 	"fmt"
 	"os/exec"
 	"strings"
@@ -11,10 +12,14 @@
 	mcv1 "github.com/openshift/machine-config-operator/pkg/apis/machineconfiguration.openshift.io/v1"
 	"github.com/openshift/machine-config-operator/pkg/daemon/constants"
 	"github.com/openshift/machine-config-operator/test/e2e/framework"
+	"github.com/stretchr/testify/require"
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/fields"
 	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/apimachinery/pkg/util/jsonmergepatch"
 	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/apimachinery/pkg/util/wait"
 )
@@ -92,8 +97,10 @@ func createMCToAddFile(name, filename, data, fs string) *mcv1.MachineConfig {
 
 func TestMCDeployed(t *testing.T) {
 	cs := framework.NewClientSet("")
+	bumpPoolMaxUnavailableTo(t, cs, 3)
 
-	for i := 0; i < 10; i++ {
+	// TODO: bring this back to 10
+	for i := 0; i < 5; i++ {
 		mcadd := createMCToAddFile("add-a-file", fmt.Sprintf("/etc/mytestconf%d", i), "test", "root")
 
 		// create the dummy MC now
@@ -145,8 +152,24 @@ func TestMCDeployed(t *testing.T) {
 	}
 }
 
+func bumpPoolMaxUnavailableTo(t *testing.T, cs *framework.ClientSet, max int) {
+	pool, err := cs.MachineConfigPools().Get("worker", metav1.GetOptions{})
+	require.Nil(t, err)
+	old, err := json.Marshal(pool)
+	require.Nil(t, err)
+	maxUnavailable := intstr.FromInt(max)
+	pool.Spec.MaxUnavailable = &maxUnavailable
+	new, err := json.Marshal(pool)
+	require.Nil(t, err)
+	patch, err := jsonmergepatch.CreateThreeWayJSONMergePatch(old, new, old)
+	require.Nil(t, err)
+	_, err = cs.MachineConfigPools().Patch("worker", types.MergePatchType, patch)
+	require.Nil(t, err)
+}
+
 func TestUpdateSSH(t *testing.T) {
 	cs := framework.NewClientSet("")
+	bumpPoolMaxUnavailableTo(t, cs, 3)
 
 	// create a dummy MC with an sshKey for user Core
 	mcName := fmt.Sprintf("sshkeys-worker-%s", uuid.NewUUID())
@@ -310,6 +333,7 @@ func TestPoolDegradedOnFailToRender(t *testing.T) {
 
 func TestReconcileAfterBadMC(t *testing.T) {
 	cs := framework.NewClientSet("")
+	bumpPoolMaxUnavailableTo(t, cs, 3)
 
 	// create a bad MC w/o a filesystem field which is going to fail reconciling
 	mcadd := createMCToAddFile("add-a-file", "/etc/mytestconfs", "test", "")
@@ -372,7 +396,7 @@
 		if err != nil {
 			return false, err
 		}
-		if mcp.Status.UnavailableMachineCount == 1 {
+		if mcp.Status.UnavailableMachineCount >= 1 {
 			return true, nil
 		}
 		return false, nil
@@ -400,7 +424,7 @@
 	}
 
 	visited := make(map[string]bool)
-	if err := wait.Poll(2*time.Second, 10*time.Minute, func() (bool, error) {
+	if err := wait.Poll(2*time.Second, 30*time.Minute, func() (bool, error) {
 		nodes, err := getNodesByRole(cs, "worker")
 		if err != nil {
 			return false, err

From 87f4cc96fc9e8744a1a014674a3739ad6ce090e0 Mon Sep 17 00:00:00 2001
From: Colin Walters
Date: Thu, 2 May 2019 15:07:59 +0000
Subject: [PATCH 05/11] controller: Log when a node has been updated

The MCC logs show when we tell a node to start, but don't show us when
it's done. This helps make the MCC logs a centralized aggregate of
events.

Doing this now to aid debugging slowness in updates in our
`e2e-aws-op` test suite.
---
 pkg/controller/node/node_controller.go | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pkg/controller/node/node_controller.go b/pkg/controller/node/node_controller.go
index 04fcd87258..0765634feb 100644
--- a/pkg/controller/node/node_controller.go
+++ b/pkg/controller/node/node_controller.go
@@ -200,7 +200,14 @@ func (ctrl *Controller) updateNode(old, cur interface{}) {
 	if pool == nil {
 		return
 	}
-	glog.V(4).Infof("Node %s updated", curNode.Name)
+
+	// Specifically log when a node has completed an update so the MCC logs are a useful central aggregate of state changes
+	if oldNode.Annotations[daemonconsts.CurrentMachineConfigAnnotationKey] != oldNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] &&
+		curNode.Annotations[daemonconsts.CurrentMachineConfigAnnotationKey] == curNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey] {
+		glog.Infof("Pool %s: node %s has completed update to %s", pool.Name, curNode.Name, curNode.Annotations[daemonconsts.DesiredMachineConfigAnnotationKey])
+	} else {
+		glog.V(4).Infof("Node %s updated", curNode.Name)
+	}
 	ctrl.enqueueMachineConfigPool(pool)
 }
 

From 2a19206dc5c74821bc48ad379f4561cb03ec7a95 Mon Sep 17 00:00:00 2001
From: Colin Walters
Date: Fri, 3 May 2019 09:05:02 +0000
Subject: [PATCH 06/11] status: Add some doc comments

General principle.
---
 pkg/controller/node/status.go | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pkg/controller/node/status.go b/pkg/controller/node/status.go
index daf98c00c8..66d4667d73 100644
--- a/pkg/controller/node/status.go
+++ b/pkg/controller/node/status.go
@@ -111,6 +111,8 @@ func calculateStatus(pool *mcfgv1.MachineConfigPool, nodes []*corev1.Node) mcfgv
 	return status
 }
 
+// getUpdatedMachines filters the provided nodes to ones whose current == desired
+// and the "done" flag is set.
 func getUpdatedMachines(currentConfig string, nodes []*corev1.Node) []*corev1.Node {
 	var updated []*corev1.Node
 	for _, node := range nodes {
@@ -138,6 +140,8 @@ func getUpdatedMachines(currentConfig string, nodes []*corev1.Node) []*corev1.No
 	return updated
 }
 
+// getReadyMachines filters the provided nodes to ones which are updated
+// and marked ready.
 func getReadyMachines(currentConfig string, nodes []*corev1.Node) []*corev1.Node {
 	updated := getUpdatedMachines(currentConfig, nodes)
 	var ready []*corev1.Node
@@ -178,6 +182,9 @@ func isNodeReady(node *corev1.Node) bool {
 	return true
 }
 
 // TODO(runcom): drop currentConfig arg
+// getUnavailableMachines is the opposite of getReadyMachines - it filters the provided
+// set of nodes to ones which are either not ready, or whose current config does
+// not match the target.
 func getUnavailableMachines(currentConfig string, nodes []*corev1.Node) []*corev1.Node {
 	var unavail []*corev1.Node
 	for _, node := range nodes {

From 60c9243451ba0f155da3a3e7cdc0028a37e972c9 Mon Sep 17 00:00:00 2001
From: Colin Walters
Date: Fri, 3 May 2019 09:13:28 +0000
Subject: [PATCH 07/11] daemon: Log drain timings to journal too

The journal saves our history in a reliable way; let's log more things
there too so we can debug e.g. how long things took.

Also consistently log to the journal with the `machine-config-daemon: `
prefix (note the colon `:`) so it's easy to do:
`journalctl -r |grep machine-config-daemon:`
---
 pkg/daemon/daemon.go | 2 +-
 pkg/daemon/update.go | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go
index 1471f6b621..6921f39878 100644
--- a/pkg/daemon/daemon.go
+++ b/pkg/daemon/daemon.go
@@ -1000,7 +1000,7 @@ func (dn *Daemon) completeUpdate(node *corev1.Node, desiredConfigName string) er
 		return err
 	}
 
-	dn.logSystem("machine-config-daemon: completed update for config %s", desiredConfigName)
+	dn.logSystem("completed update for config %s", desiredConfigName)
 
 	return nil
 }
diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go
index a8d565630b..4a6aa76d8f 100644
--- a/pkg/daemon/update.go
+++ b/pkg/daemon/update.go
@@ -116,7 +116,7 @@ func (dn *Daemon) updateOSAndReboot(newConfig *mcfgv1.MachineConfig) (retErr err
 
 	// Skip draining of the node when we're not cluster driven
 	if dn.onceFrom == "" {
-		glog.Info("Update prepared; draining the node")
+		dn.logSystem("Update prepared; beginning drain")
 
 		dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Drain", "Draining node to update config.")
@@ -145,7 +145,7 @@
 			}
 			return errors.Wrap(err, "failed to drain node")
 		}
-		glog.Info("Node successfully drained")
+		dn.logSystem("drain complete")
 	}
 
 	// reboot. this function shouldn't actually return.
@@ -718,6 +718,7 @@ func (dn *Daemon) logSystem(format string, a ...interface{}) {
 
 	go func() {
 		defer stdin.Close()
+		io.WriteString(stdin, "machine-config-daemon: ")
 		io.WriteString(stdin, message)
 	}()
 	err = logger.Run()
@@ -748,7 +749,7 @@ func (dn *Daemon) reboot(rationale string, timeout time.Duration, rebootCmd *exe
 	if dn.recorder != nil {
 		dn.recorder.Eventf(getNodeRef(dn.node), corev1.EventTypeNormal, "Reboot", rationale)
 	}
-	dn.logSystem("machine-config-daemon initiating reboot: %s", rationale)
+	dn.logSystem("initiating reboot: %s", rationale)
 
 	// Now that everything is done, avoid delaying shutdown.
 	dn.cancelSIGTERM()

From b042fecb3d40ed4707b373f225f350d298560a01 Mon Sep 17 00:00:00 2001
From: Colin Walters
Date: Fri, 3 May 2019 09:19:06 +0000
Subject: [PATCH 08/11] daemon: Log to journal MCD startup and initiating
 config write

For more debugging.
---
 pkg/daemon/daemon.go | 2 +-
 pkg/daemon/update.go | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go
index 6921f39878..4412646541 100644
--- a/pkg/daemon/daemon.go
+++ b/pkg/daemon/daemon.go
@@ -268,7 +268,7 @@ func NewClusterDrivenDaemon(
 	eventBroadcaster.StartRecordingToSink(&clientsetcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
 	dn.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "machineconfigdaemon", Host: nodeName})
 
-	glog.Infof("Managing node: %s", nodeName)
+	dn.logSystem("Starting to manage node: %s", nodeName)
 
 	go dn.runLoginMonitor(dn.stopCh, dn.exitCh)
diff --git a/pkg/daemon/update.go b/pkg/daemon/update.go
index 4a6aa76d8f..f7dfafc4ff 100644
--- a/pkg/daemon/update.go
+++ b/pkg/daemon/update.go
@@ -194,6 +194,7 @@ func (dn *Daemon) update(oldConfig, newConfig *mcfgv1.MachineConfig) (retErr err
 		dn.logSystem(wrappedErr.Error())
 		return errors.Wrapf(errUnreconcilable, "%v", wrappedErr)
 	}
+	dn.logSystem("Starting update from %s to %s", oldConfigName, newConfigName)
 
 	// update files on disk that need updating
 	if err := dn.updateFiles(oldConfig, newConfig); err != nil {

From 8b8bef915bb4c2e97aabfd5b67af445455a9a105 Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Fri, 3 May 2019 20:23:44 +0200
Subject: [PATCH 09/11] Revert "pkg/controller: do not progress if the pool
 isn't idle"

This reverts commit d41155f8ec0cd0fdefdea7365971195145a7c0b9.
---
 pkg/controller/node/node_controller_test.go |  4 ++--
 pkg/controller/node/status.go               |  6 +-----
 pkg/controller/node/status_test.go          | 10 ++--------
 3 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/pkg/controller/node/node_controller_test.go b/pkg/controller/node/node_controller_test.go
index 0eb2aa1516..6a89bd6b76 100644
--- a/pkg/controller/node/node_controller_test.go
+++ b/pkg/controller/node/node_controller_test.go
@@ -452,13 +452,13 @@ func TestGetCandidateMachines(t *testing.T) {
 		expected: []*corev1.Node{newNodeWithReady("node-3", "v0", "v0", corev1.ConditionTrue)},
 	}, {
 		//progress on old stuck node
-		progress: 0,
+		progress: 1,
 		nodes: []*corev1.Node{
 			newNodeWithReady("node-0", "v1", "v1", corev1.ConditionTrue),
 			newNodeWithReady("node-1", "v0.1", "v0.2", corev1.ConditionFalse),
 			newNodeWithReady("node-2", "v0", "v0", corev1.ConditionTrue),
 		},
-		expected: []*corev1.Node{},
+		expected: []*corev1.Node{newNodeWithReady("node-1", "v0.1", "v0.2", corev1.ConditionFalse)},
 	}}
 
 	for idx, test := range tests {
diff --git a/pkg/controller/node/status.go b/pkg/controller/node/status.go
index 66d4667d73..4ef5b48d64 100644
--- a/pkg/controller/node/status.go
+++ b/pkg/controller/node/status.go
@@ -181,10 +181,6 @@ func isNodeReady(node *corev1.Node) bool {
 	return true
 }
 
-// TODO(runcom): drop currentConfig arg
-// getUnavailableMachines is the opposite of getReadyMachines - it filters the provided
-// set of nodes to ones which are either not ready, or whose current config does
-// not match the target.
 func getUnavailableMachines(currentConfig string, nodes []*corev1.Node) []*corev1.Node {
 	var unavail []*corev1.Node
 	for _, node := range nodes {
@@ -201,7 +197,7 @@
 		}
 		nodeNotReady := !isNodeReady(node)
-		if dconfig != cconfig || nodeNotReady {
+		if dconfig == currentConfig && (dconfig != cconfig || nodeNotReady) {
 			unavail = append(unavail, node)
 			glog.V(2).Infof("Node %s unavailable: different configs (desired: %s, current %s) or node not ready %v", node.Name, dconfig, cconfig, nodeNotReady)
 		}
diff --git a/pkg/controller/node/status_test.go b/pkg/controller/node/status_test.go
index 4a588ee293..e437ccd1e6 100644
--- a/pkg/controller/node/status_test.go
+++ b/pkg/controller/node/status_test.go
@@ -239,10 +239,7 @@ func TestGetUnavailableMachines(t *testing.T) {
 			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
 		},
 		currentConfig: "v2",
-		unavail: []*corev1.Node{
-			newNodeWithReady("node-0", "v0", "v1", corev1.ConditionTrue),
-			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
-		},
+		unavail: []*corev1.Node{newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue)},
 	}, {
 		// 1 node updated, 1 updating, 1 updating but not v2 and is not ready
 		nodes: []*corev1.Node{
 			newNodeWithReady("node-0", "v0", "v1", corev1.ConditionFalse),
 			newNodeWithReady("node-1", "v1", "v1", corev1.ConditionTrue),
 			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
 		},
 		currentConfig: "v2",
-		unavail: []*corev1.Node{
-			newNodeWithReady("node-0", "v0", "v1", corev1.ConditionFalse),
-			newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue),
-		},
+		unavail: []*corev1.Node{newNodeWithReady("node-2", "v0", "v2", corev1.ConditionTrue)},
 	}, {
 		// 2 node updated, 1 updating
 		nodes: []*corev1.Node{

From 0688ba8bcffffb1f6debd4da312ae8fc9b799d58 Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Fri, 3 May 2019 20:26:18 +0200
Subject: [PATCH 10/11] store currentconfig under /etc since we have a bind
 mount from the host

---
 pkg/daemon/daemon.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go
index 4412646541..322cec26f8 100644
--- a/pkg/daemon/daemon.go
+++ b/pkg/daemon/daemon.go
@@ -135,7 +135,7 @@ const (
 	pathStateJSON = "/etc/machine-config-daemon/state.json"
 	// currentConfigPath is where we store the current config on disk to validate
 	// against annotations changes
-	currentConfigPath = "/var/machine-config-daemon/currentconfig"
+	currentConfigPath = "/etc/machine-config-daemon/currentconfig"
 
 	kubeletHealthzEndpoint        = "http://localhost:10248/healthz"
 	kubeletHealthzPollingInterval = time.Duration(30 * time.Second)

From dba25ce796c79d2b9ebc6aa7cc3885d85a7bfa4b Mon Sep 17 00:00:00 2001
From: Antonio Murdaca
Date: Fri, 3 May 2019 22:48:38 +0200
Subject: [PATCH 11/11] pkg/daemon: use a channel to signal termination

---
 cmd/machine-config-daemon/start.go | 5 +++--
 pkg/daemon/daemon.go               | 8 +++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/cmd/machine-config-daemon/start.go b/cmd/machine-config-daemon/start.go
index d7596363ae..61dd94eeec 100644
--- a/cmd/machine-config-daemon/start.go
+++ b/cmd/machine-config-daemon/start.go
@@ -206,9 +206,10 @@ func runStartCmd(cmd *cobra.Command, args []string) {
 	glog.Info("Starting MachineConfigDaemon")
 	defer glog.Info("Shutting down MachineConfigDaemon")
 
-	dn.InstallSignalHandler(stopCh)
+	signaled := make(chan struct{})
+	dn.InstallSignalHandler(signaled)
 
-	if err := dn.Run(stopCh, exitCh); err != nil {
+	if err := dn.Run(stopCh, signaled, exitCh); err != nil {
 		glog.Fatalf("Failed to run: %v", err)
 	}
 }
diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go
index 322cec26f8..d3384afb59 100644
--- a/pkg/daemon/daemon.go
+++ b/pkg/daemon/daemon.go
@@ -472,7 +472,7 @@ func (dn *Daemon) runOnceFrom() error {
 }
 
 // InstallSignalHandler installs the handler for the signals the daemon should act on
-func (dn *Daemon) InstallSignalHandler(stopCh chan struct{}) {
+func (dn *Daemon) InstallSignalHandler(signaled chan struct{}) {
 	termChan := make(chan os.Signal, 2048)
 	signal.Notify(termChan, syscall.SIGTERM)
 
@@ -486,7 +486,7 @@
 				if dn.updateActive {
 					glog.Info("Got SIGTERM, but actively updating")
 				} else {
-					close(stopCh)
+					close(signaled)
 				}
 			}
 		}
@@ -496,7 +496,7 @@
 // Run finishes informer setup and then blocks, and the informer will be
 // responsible for triggering callbacks to handle updates. Successful
 // updates shouldn't return, and should just reboot the node.
-func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {
+func (dn *Daemon) Run(stopCh, signaled <-chan struct{}, exitCh <-chan error) error {
 	if dn.kubeletHealthzEnabled {
 		glog.Info("Enabling Kubelet Healthz Monitor")
 		go dn.runKubeletHealthzMonitor(stopCh, dn.exitCh)
@@ -520,6 +520,8 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {
 	select {
 	case <-stopCh:
 		return nil
+	case <-signaled:
+		return nil
 	case err := <-exitCh:
 		// This channel gets errors from auxiliary goroutines like loginmonitor and kubehealth
 		glog.Warningf("Got an error from auxiliary tools: %v", err)