From 5d6f7be936b5007440d36e3c1a4b3ebd0aeba572 Mon Sep 17 00:00:00 2001 From: Drew Erny Date: Mon, 14 Jan 2019 12:36:48 -0600 Subject: [PATCH] Fix leaking task resources when nodes are deleted When a node is deleted, its tasks are asked to restart, which involves putting them into a desired state of Shutdown. However, the Allocator will not deallocate a task which is not in an actual state of a terminal state. Once a node is deleted, the only opportunity for its tasks to recieve updates and be moved to a terminal state is when the function moving those tasks to TaskStateOrphaned is called, 24 hours after the node enters the Down state. However, if a leadership change occurs, then that function will never be called, and the tasks will never be moved to a terminal state, leaking resources. With this change, upon node deletion, all of its tasks will be moved to TaskStateOrphaned, allowing those tasks' resources to be cleaned up. Signed-off-by: Drew Erny (cherry picked from commit 8467e6a43f9cee66f46efb457b2c22f46ae5da0b) Signed-off-by: Sebastiaan van Stijn --- manager/controlapi/node.go | 26 ++++++++++++-------------- manager/controlapi/node_test.go | 16 ++++++++-------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/manager/controlapi/node.go b/manager/controlapi/node.go index 68a759fc02..5308b7419e 100644 --- a/manager/controlapi/node.go +++ b/manager/controlapi/node.go @@ -254,25 +254,23 @@ func (s *Server) UpdateNode(ctx context.Context, request *api.UpdateNodeRequest) }, nil } -func removeNodeAttachments(tx store.Tx, nodeID string) error { - // orphan the node's attached containers. if we don't do this, the - // network these attachments are connected to will never be removeable +func orphanNodeTasks(tx store.Tx, nodeID string) error { + // when a node is deleted, all of its tasks are irrecoverably removed. + // additionally, the Dispatcher can no longer be relied on to update the + // task status. Therefore, when the node is removed, we must additionally + // move all of its assigned tasks to the Orphaned state, so that their + // resources can be cleaned up. tasks, err := store.FindTasks(tx, store.ByNodeID(nodeID)) if err != nil { return err } for _, task := range tasks { - // if the task is an attachment, then we just delete it. the allocator - // will do the heavy lifting. basically, GetAttachment will return the - // attachment if that's the kind of runtime, or nil if it's not. - if task.Spec.GetAttachment() != nil { - // don't delete the task. instead, update it to `ORPHANED` so that - // the taskreaper will clean it up. - task.Status.State = api.TaskStateOrphaned - if err := store.UpdateTask(tx, task); err != nil { - return err - } + task.Status = api.TaskStatus{ + Timestamp: gogotypes.TimestampNow(), + State: api.TaskStateOrphaned, + Message: "Task belonged to a node that has been deleted", } + store.UpdateTask(tx, task) } return nil } @@ -342,7 +340,7 @@ func (s *Server) RemoveNode(ctx context.Context, request *api.RemoveNodeRequest) return err } - if err := removeNodeAttachments(tx, request.NodeID); err != nil { + if err := orphanNodeTasks(tx, request.NodeID); err != nil { return err } diff --git a/manager/controlapi/node_test.go b/manager/controlapi/node_test.go index 00a5d9b7a3..4b0e7a81f0 100644 --- a/manager/controlapi/node_test.go +++ b/manager/controlapi/node_test.go @@ -943,9 +943,7 @@ func TestUpdateNodeDemote(t *testing.T) { } // TestRemoveNodeAttachments tests the unexported removeNodeAttachments -// function. This avoids us having to update the TestRemoveNodes function to -// test all of this logic -func TestRemoveNodeAttachments(t *testing.T) { +func TestOrphanNodeTasks(t *testing.T) { // first, set up a store and all that ts := newTestServer(t) defer ts.Stop() @@ -1089,22 +1087,24 @@ func TestRemoveNodeAttachments(t *testing.T) { // Now, call the function with our nodeID. make sure it returns no error err = ts.Store.Update(func(tx store.Tx) error { - return removeNodeAttachments(tx, "id2") + return orphanNodeTasks(tx, "id2") }) require.NoError(t, err) - // Now, make sure only task1, the network-attacahed task on id2, was - // removed + // Now, make sure only tasks 1 and 3, the tasks on the node we're deleting + // removed, are removed ts.Store.View(func(tx store.ReadTx) { tasks, err := store.FindTasks(tx, store.All) require.NoError(t, err) // should only be 3 tasks left require.Len(t, tasks, 4) - // and the list should not contain task1 + // and the list should not contain task1 or task2 for _, task := range tasks { require.NotNil(t, task) - if task.ID == "task1" { + if task.ID == "task1" || task.ID == "task3" { require.Equal(t, task.Status.State, api.TaskStateOrphaned) + } else { + require.NotEqual(t, task.Status.State, api.TaskStateOrphaned) } } })