From d60ced2a88827470c4f8675830d6c8ab468192bc Mon Sep 17 00:00:00 2001 From: Jin Xu Date: Sat, 9 Jul 2016 13:47:13 +0800 Subject: [PATCH] Fix race condition between task creation and removal This commit fixes a race condition between task creation and removal which could causes orphan containers. It happens when a agent task is handling creation event and creating a container, a removal event comes in and cancels the task's context. Both the creation and removal would fail but the container would still created by the docker daemon. The removal fails because when the DELETE request arrives the docker daemon, the container is still being created. goroutine (creation) task (removal) --------- ---- ... ... ctlr.Prepare(ctx) r.adapter.createNetworks(ctx) ... case <-shutdown r.adapter.create(ctx) cancel() //failed due to context cancelled tm.ctlr.Remove(ctx) //container still createdby daemon //failed due to "No such //container" error because //the container not created //yet ... //the container created by docker //daemon but becomes orphan This patch fixes it by moving the cancel operation to be after the remove operation. In addition, a lock is added for exclusive access to the container resouces between creation and removal operations. These ensures that the creation of a container could either be finished or not started when the remove operation enters the critical section. The pull operation is moved out the critical section because it might be too time consuming. Signed-off-by: Jin Xu --- agent/exec/container/controller.go | 63 +++++++++++++++++++++++++----- agent/exec/errors.go | 3 ++ agent/task.go | 9 +++-- 3 files changed, 61 insertions(+), 14 deletions(-) diff --git a/agent/exec/container/controller.go b/agent/exec/container/controller.go index fe07576a51..a6f8cb4afd 100644 --- a/agent/exec/container/controller.go +++ b/agent/exec/container/controller.go @@ -2,6 +2,7 @@ package container import ( "fmt" + "sync" engineapi "github.com/docker/engine-api/client" "github.com/docker/engine-api/types" @@ -13,6 +14,14 @@ import ( "golang.org/x/net/context" ) +type controllerState int + +const ( + stateNew controllerState = iota + stateCreated + stateRemoved +) + // controller implements agent.Controller against docker's API. // // Most operations against docker's API are done through the container name, @@ -23,6 +32,8 @@ type controller struct { adapter *containerAdapter closed chan struct{} err error + state controllerState + mtx sync.Mutex } var _ exec.Controller = &controller{} @@ -39,6 +50,7 @@ func newController(client engineapi.APIClient, task *api.Task) (exec.Controller, task: task, adapter: adapter, closed: make(chan struct{}), + state: stateNew, }, nil } @@ -76,16 +88,6 @@ func (r *controller) Prepare(ctx context.Context) error { return err } - // Make sure all the networks that the task needs are created. - if err := r.adapter.createNetworks(ctx); err != nil { - return err - } - - // Make sure all the volumes that the task needs are created. - if err := r.adapter.createVolumes(ctx, r.client); err != nil { - return err - } - if err := r.adapter.pullImage(ctx); err != nil { // NOTE(stevvooe): We always try to pull the image to make sure we have // the most up to date version. This will return an error, but we only @@ -101,6 +103,27 @@ func (r *controller) Prepare(ctx context.Context) error { log.G(ctx).WithError(err).Error("pulling image failed") } + // Make sure the controller has not been removed. Because we + // might lost race to a remover. + r.mtx.Lock() + defer r.mtx.Unlock() + + if r.checkState(stateRemoved) { + return exec.ErrControllerRemoved + } + + r.updateState(stateCreated) + + // Make sure all the networks that the task needs are created. + if err := r.adapter.createNetworks(ctx); err != nil { + return err + } + + // Make sure all the volumes that the task needs are created. + if err := r.adapter.createVolumes(ctx, r.client); err != nil { + return err + } + if err := r.adapter.create(ctx); err != nil { if isContainerCreateNameConflict(err) { if _, err := r.adapter.inspect(ctx); err != nil { @@ -322,6 +345,16 @@ func (r *controller) Remove(ctx context.Context) error { return err } + r.mtx.Lock() + defer r.mtx.Unlock() + + if !r.checkState(stateCreated) { + return nil //nothing to remove + } + + //Transition to removed state no matter it will succeed or fail. + r.updateState(stateRemoved) + // It may be necessary to shut down the task before removing it. if err := r.Shutdown(ctx); err != nil { if isUnknownContainer(err) { @@ -389,6 +422,16 @@ func (r *controller) checkClosed() error { } } +//Caller should lock if necessary +func (r *controller) checkState(desiredState controllerState) bool { + return r.state == desiredState +} + +//Caller should lock if necessary +func (r *controller) updateState(newState controllerState) { + r.state = newState +} + type exitError struct { code int cause error diff --git a/agent/exec/errors.go b/agent/exec/errors.go index 4d082b76fe..4d1c20b591 100644 --- a/agent/exec/errors.go +++ b/agent/exec/errors.go @@ -22,6 +22,9 @@ var ( // ErrControllerClosed returned when a task controller has been closed. ErrControllerClosed = errors.New("exec: controller closed") + // ErrControllerRemoved returned when a task controller has been removed. + ErrControllerRemoved = errors.New("exec: controller removed") + // ErrTaskRetry is returned by Do when an operation failed by should be // retried. The status should still be reported in this case. ErrTaskRetry = errors.New("exec: task retry") diff --git a/agent/task.go b/agent/task.go index 797f93c769..40c1537928 100644 --- a/agent/task.go +++ b/agent/task.go @@ -200,10 +200,6 @@ func (tm *taskManager) run(ctx context.Context) { } } case <-shutdown: - if cancel != nil { - // cancel outstanding operation. - cancel() - } // TODO(stevvooe): This should be left for the repear. @@ -213,6 +209,11 @@ func (tm *taskManager) run(ctx context.Context) { log.G(ctx).WithError(err).WithField("task.id", tm.task.ID).Error("remove task failed") } + if cancel != nil { + // cancel outstanding operation. + cancel() + } + if err := tm.ctlr.Close(); err != nil { log.G(ctx).WithError(err).Error("error closing controller") }