From d401f74244ee98b61e1eb2931ea995cfea2e6fad Mon Sep 17 00:00:00 2001 From: Tonis Tiigi Date: Mon, 15 Jul 2019 17:23:55 -0700 Subject: [PATCH 1/4] integration-cli: allow temporary errors on leader switch Signed-off-by: Tonis Tiigi (cherry picked from commit 3df1095bbdc331d4effa5452d8aafd5aaead5789) Signed-off-by: Sebastiaan van Stijn --- integration-cli/docker_api_swarm_test.go | 16 ++++++++++++++-- internal/test/daemon/node.go | 9 ++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/integration-cli/docker_api_swarm_test.go b/integration-cli/docker_api_swarm_test.go index e7d1b983322fc..7b498d5ea9e6f 100644 --- a/integration-cli/docker_api_swarm_test.go +++ b/integration-cli/docker_api_swarm_test.go @@ -27,6 +27,7 @@ import ( "github.com/docker/docker/internal/test/request" "github.com/docker/swarmkit/ca" "github.com/go-check/check" + "github.com/pkg/errors" "gotest.tools/assert" is "gotest.tools/assert/cmp" ) @@ -313,13 +314,24 @@ func (s *DockerSwarmSuite) TestAPISwarmLeaderElection(c *check.C) { leader *daemon.Daemon // keep track of leader followers []*daemon.Daemon // keep track of followers ) + var lastErr error checkLeader := func(nodes ...*daemon.Daemon) checkF { return func(c *check.C) (interface{}, check.CommentInterface) { // clear these out before each run leader = nil followers = nil for _, d := range nodes { - if d.GetNode(c, d.NodeID()).ManagerStatus.Leader { + n := d.GetNode(c, d.NodeID(), func(err error) bool { + if strings.Contains(errors.Cause(err).Error(), context.DeadlineExceeded.Error()) || strings.Contains(err.Error(), "swarm does not have a leader") { + lastErr = err + return true + } + return false + }) + if n == nil { + return false, check.Commentf("failed to get node: %v", lastErr) + } + if n.ManagerStatus.Leader { leader = d } else { followers = append(followers, d) @@ -391,7 +403,7 @@ func (s *DockerSwarmSuite) TestAPISwarmRaftQuorum(c *check.C) { defer cli.Close() // d1 will eventually step down from leader because there is no longer an active quorum, wait for that to happen - waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) { + waitAndAssert(c, defaultReconciliationTimeout*2, func(c *check.C) (interface{}, check.CommentInterface) { _, err := cli.ServiceCreate(context.Background(), service.Spec, types.ServiceCreateOptions{}) return err.Error(), nil }, checker.Contains, "Make sure more than half of the managers are online.") diff --git a/internal/test/daemon/node.go b/internal/test/daemon/node.go index 33dd365429531..d5f6f15263fa4 100644 --- a/internal/test/daemon/node.go +++ b/internal/test/daemon/node.go @@ -15,7 +15,7 @@ import ( type NodeConstructor func(*swarm.Node) // GetNode returns a swarm node identified by the specified id -func (d *Daemon) GetNode(t assert.TestingT, id string) *swarm.Node { +func (d *Daemon) GetNode(t assert.TestingT, id string, errCheck ...func(error) bool) *swarm.Node { if ht, ok := t.(test.HelperT); ok { ht.Helper() } @@ -23,6 +23,13 @@ func (d *Daemon) GetNode(t assert.TestingT, id string) *swarm.Node { defer cli.Close() node, _, err := cli.NodeInspectWithRaw(context.Background(), id) + if err != nil { + for _, f := range errCheck { + if f(err) { + return nil + } + } + } assert.NilError(t, err, "[%s] (*Daemon).GetNode: NodeInspectWithRaw(%q) failed", d.id, id) assert.Check(t, node.ID == id) return &node From 54fac123544deddad9f5fd880b50edb87ae43f9f Mon Sep 17 00:00:00 2001 From: Tonis Tiigi Date: Tue, 16 Jul 2019 14:07:26 -0700 Subject: [PATCH 2/4] integration-cli: allow temporary no-leader error Signed-off-by: Tonis Tiigi (cherry picked from commit 52e0dfef9090fa3c6003115a2c82238b189ebe42) Signed-off-by: Sebastiaan van Stijn --- integration-cli/docker_cli_swarm_test.go | 36 ++++++++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/integration-cli/docker_cli_swarm_test.go b/integration-cli/docker_cli_swarm_test.go index 1b20cbf95d597..ca6d03ee0915a 100644 --- a/integration-cli/docker_cli_swarm_test.go +++ b/integration-cli/docker_cli_swarm_test.go @@ -1303,9 +1303,21 @@ func (s *DockerSwarmSuite) TestSwarmRotateUnlockKey(c *check.C) { c.Assert(getNodeStatus(c, d), checker.Equals, swarm.LocalNodeStateActive) - outs, err = d.Cmd("node", "ls") - assert.NilError(c, err) - c.Assert(outs, checker.Not(checker.Contains), "Swarm is encrypted and needs to be unlocked") + retry := 0 + for { + // an issue sometimes prevents leader to be available right away + outs, err = d.Cmd("node", "ls") + if err != nil && retry < 5 { + if strings.Contains(err.Error(), "swarm does not have a leader") { + retry++ + time.Sleep(3 * time.Second) + continue + } + } + assert.NilError(c, err) + c.Assert(outs, checker.Not(checker.Contains), "Swarm is encrypted and needs to be unlocked") + break + } unlockKey = newUnlockKey } @@ -1383,9 +1395,21 @@ func (s *DockerSwarmSuite) TestSwarmClusterRotateUnlockKey(c *check.C) { c.Assert(getNodeStatus(c, d), checker.Equals, swarm.LocalNodeStateActive) - outs, err = d.Cmd("node", "ls") - c.Assert(err, checker.IsNil, check.Commentf("%s", outs)) - c.Assert(outs, checker.Not(checker.Contains), "Swarm is encrypted and needs to be unlocked") + retry := 0 + for { + // an issue sometimes prevents leader to be available right away + outs, err = d.Cmd("node", "ls") + if err != nil && retry < 5 { + if strings.Contains(err.Error(), "swarm does not have a leader") { + retry++ + time.Sleep(3 * time.Second) + continue + } + } + c.Assert(err, checker.IsNil, check.Commentf("%s", outs)) + c.Assert(outs, checker.Not(checker.Contains), "Swarm is encrypted and needs to be unlocked") + break + } } unlockKey = newUnlockKey From b5103adf0fa531e810ebe69ac82c1ba84b93db27 Mon Sep 17 00:00:00 2001 From: Drew Erny Date: Fri, 26 Jul 2019 13:05:00 -0500 Subject: [PATCH 3/4] Fix TestSwarmClusterRotateUnlockKey TestSwarmClusterRotateUnlockKey had been identified as a flaky test. It turns out that the test code was wrong: where we should have been checking the string output of a command, we were instead checking the value of the error. This means that the error case we were expecting was not being matched, and the test was failing when it should have just retried. Signed-off-by: Drew Erny (cherry picked from commit b79adac339173bf8bb6de6d0a061a97973c4b62b) Signed-off-by: Sebastiaan van Stijn --- integration-cli/docker_cli_swarm_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration-cli/docker_cli_swarm_test.go b/integration-cli/docker_cli_swarm_test.go index ca6d03ee0915a..b5a4d31d2fe6d 100644 --- a/integration-cli/docker_cli_swarm_test.go +++ b/integration-cli/docker_cli_swarm_test.go @@ -1308,7 +1308,7 @@ func (s *DockerSwarmSuite) TestSwarmRotateUnlockKey(c *check.C) { // an issue sometimes prevents leader to be available right away outs, err = d.Cmd("node", "ls") if err != nil && retry < 5 { - if strings.Contains(err.Error(), "swarm does not have a leader") { + if strings.Contains(outs, "swarm does not have a leader") { retry++ time.Sleep(3 * time.Second) continue @@ -1400,7 +1400,7 @@ func (s *DockerSwarmSuite) TestSwarmClusterRotateUnlockKey(c *check.C) { // an issue sometimes prevents leader to be available right away outs, err = d.Cmd("node", "ls") if err != nil && retry < 5 { - if strings.Contains(err.Error(), "swarm does not have a leader") { + if strings.Contains(outs, "swarm does not have a leader") { retry++ time.Sleep(3 * time.Second) continue From e06f07ef337ab890f211397d6b408b75a2512dc5 Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Thu, 3 Oct 2019 00:20:46 +0200 Subject: [PATCH 4/4] [18.09] bump SwarmKit 5c86095cef3ff480e69486da50f18fd1b3a0de78 (bump_v18.09) full diff: https://github.com/docker/swarmkit/compare/142a73731c850daf24d32001aa2358b6ffe36eab...5c86095cef3ff480e69486da50f18fd1b3a0de78 - docker/swarmkit#2892 [18.09 backport] Remove hardcoded IPAM config subnet value for ingress network - backport of docker/swarmkit#2890 Remove hardcoded IPAM config subnet value for ingress network - fixes [ENGORC-2651](https://docker.atlassian.net/browse/ENGORC-2651) - docker/swarmkit#2836 [18.09 backport] Switch to go 1.11 - backport of docker/swarmkit#2752 Switch to go 1.11 - docker/swarmkit#2901 [18.09 backport] Bump to golang 1.12.9 - backport of docker/swarmkit#2880 Bump to golang 1.12.9 - docker/swarmkit#2900 [18.09 backport] Fix update out of sequence and increase max recv gRPC message size for nodes and secrets - backport of docker/swarmkit#2762 Increased wait time on test utils WaitForCluster and WatchTaskCreate - backport of docker/swarmkit#2771 Allow using Configs as CredentialSpecs - **second commit only** (attempt to fix weirdly broken tests) - backport of docker/swarmkit#2808 Fix flaky tests - backport of docker/swarmkit#2866 Swap gometalinter for golangci-lint - backport of docker/swarmkit#2869 Increase max recv gRPC message size to initialize connection broker - related / similar to moby/moby#38103 / https://github.com/docker/engine/pull/102 cluster: set bigger grpc limit for array requests - related / similar to moby/moby#39306 Increase max recv gRPC message size for nodes and secrets - fixes docker/swarmkit#2733 Error generated when messages size is too big - backport of docker/swarmkit#2870 Fix update out of sequence Signed-off-by: Sebastiaan van Stijn --- vendor.conf | 2 +- vendor/github.com/docker/swarmkit/agent/agent.go | 2 +- vendor/github.com/docker/swarmkit/agent/session.go | 6 ++++-- vendor/github.com/docker/swarmkit/agent/worker.go | 12 +++++------- .../github.com/docker/swarmkit/api/dispatcher.pb.go | 6 +++--- .../github.com/docker/swarmkit/api/logbroker.pb.go | 6 +++--- vendor/github.com/docker/swarmkit/api/raft.pb.go | 2 +- vendor/github.com/docker/swarmkit/api/types.pb.go | 2 +- .../swarmkit/manager/dispatcher/dispatcher.go | 2 +- .../docker/swarmkit/manager/drivers/provider.go | 2 +- .../github.com/docker/swarmkit/manager/manager.go | 10 ++++------ .../manager/orchestrator/restart/restart.go | 9 +-------- .../docker/swarmkit/manager/orchestrator/service.go | 13 +++++++++---- .../swarmkit/manager/orchestrator/update/updater.go | 5 ++++- .../docker/swarmkit/manager/scheduler/nodeinfo.go | 4 ++-- vendor/github.com/docker/swarmkit/node/node.go | 2 ++ 16 files changed, 43 insertions(+), 42 deletions(-) diff --git a/vendor.conf b/vendor.conf index 165cc5c42be6f..0464839dfc5a1 100644 --- a/vendor.conf +++ b/vendor.conf @@ -130,7 +130,7 @@ github.com/containerd/ttrpc 2a805f71863501300ae1976d29f0454ae003e85a github.com/gogo/googleapis 08a7655d27152912db7aaf4f983275eaf8d128ef # cluster -github.com/docker/swarmkit 142a73731c850daf24d32001aa2358b6ffe36eab # bump_v18.09 branch +github.com/docker/swarmkit 5c86095cef3ff480e69486da50f18fd1b3a0de78 # bump_v18.09 branch github.com/gogo/protobuf v1.0.0 github.com/cloudflare/cfssl 1.3.2 github.com/fernet/fernet-go 1b2437bc582b3cfbb341ee5a29f8ef5b42912ff2 diff --git a/vendor/github.com/docker/swarmkit/agent/agent.go b/vendor/github.com/docker/swarmkit/agent/agent.go index 743072f9dafea..58ebff5934ff0 100644 --- a/vendor/github.com/docker/swarmkit/agent/agent.go +++ b/vendor/github.com/docker/swarmkit/agent/agent.go @@ -575,7 +575,7 @@ func (a *Agent) nodeDescriptionWithHostname(ctx context.Context, tlsInfo *api.No // Override hostname and TLS info if desc != nil { - if a.config.Hostname != "" && desc != nil { + if a.config.Hostname != "" { desc.Hostname = a.config.Hostname } desc.TLSInfo = tlsInfo diff --git a/vendor/github.com/docker/swarmkit/agent/session.go b/vendor/github.com/docker/swarmkit/agent/session.go index 526953509b5bc..2e7f1b6a37081 100644 --- a/vendor/github.com/docker/swarmkit/agent/session.go +++ b/vendor/github.com/docker/swarmkit/agent/session.go @@ -3,6 +3,7 @@ package agent import ( "context" "errors" + "math" "sync" "time" @@ -64,6 +65,7 @@ func newSession(ctx context.Context, agent *Agent, delay time.Duration, sessionI cc, err := agent.config.ConnBroker.Select( grpc.WithTransportCredentials(agent.config.Credentials), grpc.WithTimeout(dispatcherRPCTimeout), + grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(math.MaxInt32)), ) if err != nil { @@ -136,7 +138,7 @@ func (s *session) start(ctx context.Context, description *api.NodeDescription) e // `ctx` is done and hence fail to propagate the timeout error to the agent. // If the error is not propogated to the agent, the agent will not close // the session or rebuild a new session. - sessionCtx, cancelSession := context.WithCancel(ctx) // nolint: vet + sessionCtx, cancelSession := context.WithCancel(ctx) //nolint:govet // Need to run Session in a goroutine since there's no way to set a // timeout for an individual Recv call in a stream. @@ -159,7 +161,7 @@ func (s *session) start(ctx context.Context, description *api.NodeDescription) e select { case err := <-errChan: if err != nil { - return err // nolint: vet + return err //nolint:govet } case <-time.After(dispatcherRPCTimeout): cancelSession() diff --git a/vendor/github.com/docker/swarmkit/agent/worker.go b/vendor/github.com/docker/swarmkit/agent/worker.go index efe538afa767f..212aa1e12f4ec 100644 --- a/vendor/github.com/docker/swarmkit/agent/worker.go +++ b/vendor/github.com/docker/swarmkit/agent/worker.go @@ -257,13 +257,11 @@ func reconcileTaskState(ctx context.Context, w *worker, assignments []*api.Assig } closeManager := func(tm *taskManager) { - go func(tm *taskManager) { - defer w.closers.Done() - // when a task is no longer assigned, we shutdown the task manager - if err := tm.Close(); err != nil { - log.G(ctx).WithError(err).Error("error closing task manager") - } - }(tm) + defer w.closers.Done() + // when a task is no longer assigned, we shutdown the task manager + if err := tm.Close(); err != nil { + log.G(ctx).WithError(err).Error("error closing task manager") + } // make an attempt at removing. this is best effort. any errors will be // retried by the reaper later. diff --git a/vendor/github.com/docker/swarmkit/api/dispatcher.pb.go b/vendor/github.com/docker/swarmkit/api/dispatcher.pb.go index cc443848deefc..f72d3d9995b5c 100644 --- a/vendor/github.com/docker/swarmkit/api/dispatcher.pb.go +++ b/vendor/github.com/docker/swarmkit/api/dispatcher.pb.go @@ -1664,7 +1664,7 @@ func (p *raftProxyDispatcherServer) Session(r *SessionRequest, stream Dispatcher } streamWrapper := Dispatcher_SessionServerWrapper{ Dispatcher_SessionServer: stream, - ctx: ctx, + ctx: ctx, } return p.local.Session(r, streamWrapper) } @@ -1785,7 +1785,7 @@ func (p *raftProxyDispatcherServer) Tasks(r *TasksRequest, stream Dispatcher_Tas } streamWrapper := Dispatcher_TasksServerWrapper{ Dispatcher_TasksServer: stream, - ctx: ctx, + ctx: ctx, } return p.local.Tasks(r, streamWrapper) } @@ -1836,7 +1836,7 @@ func (p *raftProxyDispatcherServer) Assignments(r *AssignmentsRequest, stream Di } streamWrapper := Dispatcher_AssignmentsServerWrapper{ Dispatcher_AssignmentsServer: stream, - ctx: ctx, + ctx: ctx, } return p.local.Assignments(r, streamWrapper) } diff --git a/vendor/github.com/docker/swarmkit/api/logbroker.pb.go b/vendor/github.com/docker/swarmkit/api/logbroker.pb.go index b6231e941cf5a..5456c85816dc3 100644 --- a/vendor/github.com/docker/swarmkit/api/logbroker.pb.go +++ b/vendor/github.com/docker/swarmkit/api/logbroker.pb.go @@ -1335,7 +1335,7 @@ func (p *raftProxyLogsServer) SubscribeLogs(r *SubscribeLogsRequest, stream Logs } streamWrapper := Logs_SubscribeLogsServerWrapper{ Logs_SubscribeLogsServer: stream, - ctx: ctx, + ctx: ctx, } return p.local.SubscribeLogs(r, streamWrapper) } @@ -1458,7 +1458,7 @@ func (p *raftProxyLogBrokerServer) ListenSubscriptions(r *ListenSubscriptionsReq } streamWrapper := LogBroker_ListenSubscriptionsServerWrapper{ LogBroker_ListenSubscriptionsServer: stream, - ctx: ctx, + ctx: ctx, } return p.local.ListenSubscriptions(r, streamWrapper) } @@ -1509,7 +1509,7 @@ func (p *raftProxyLogBrokerServer) PublishLogs(stream LogBroker_PublishLogsServe } streamWrapper := LogBroker_PublishLogsServerWrapper{ LogBroker_PublishLogsServer: stream, - ctx: ctx, + ctx: ctx, } return p.local.PublishLogs(streamWrapper) } diff --git a/vendor/github.com/docker/swarmkit/api/raft.pb.go b/vendor/github.com/docker/swarmkit/api/raft.pb.go index 058b29450c42a..a32a6001b92fd 100644 --- a/vendor/github.com/docker/swarmkit/api/raft.pb.go +++ b/vendor/github.com/docker/swarmkit/api/raft.pb.go @@ -1746,7 +1746,7 @@ func (p *raftProxyRaftServer) StreamRaftMessage(stream Raft_StreamRaftMessageSer } streamWrapper := Raft_StreamRaftMessageServerWrapper{ Raft_StreamRaftMessageServer: stream, - ctx: ctx, + ctx: ctx, } return p.local.StreamRaftMessage(streamWrapper) } diff --git a/vendor/github.com/docker/swarmkit/api/types.pb.go b/vendor/github.com/docker/swarmkit/api/types.pb.go index f41d5e2033735..f5843467cb405 100644 --- a/vendor/github.com/docker/swarmkit/api/types.pb.go +++ b/vendor/github.com/docker/swarmkit/api/types.pb.go @@ -594,7 +594,7 @@ var MaybeEncryptedRecord_Algorithm_name = map[int32]string{ 2: "FERNET_AES_128_CBC", } var MaybeEncryptedRecord_Algorithm_value = map[string]int32{ - "NONE": 0, + "NONE": 0, "SECRETBOX_SALSA20_POLY1305": 1, "FERNET_AES_128_CBC": 2, } diff --git a/vendor/github.com/docker/swarmkit/manager/dispatcher/dispatcher.go b/vendor/github.com/docker/swarmkit/manager/dispatcher/dispatcher.go index 61498064702db..d1db2fdc83c16 100644 --- a/vendor/github.com/docker/swarmkit/manager/dispatcher/dispatcher.go +++ b/vendor/github.com/docker/swarmkit/manager/dispatcher/dispatcher.go @@ -238,7 +238,7 @@ func (d *Dispatcher) Run(ctx context.Context) error { if err != nil { return err } - if err == nil && len(clusters) == 1 { + if len(clusters) == 1 { heartbeatPeriod, err := gogotypes.DurationFromProto(clusters[0].Spec.Dispatcher.HeartbeatPeriod) if err == nil && heartbeatPeriod > 0 { d.config.HeartbeatPeriod = heartbeatPeriod diff --git a/vendor/github.com/docker/swarmkit/manager/drivers/provider.go b/vendor/github.com/docker/swarmkit/manager/drivers/provider.go index 0d9be6119d1a7..97c36fe73d96b 100644 --- a/vendor/github.com/docker/swarmkit/manager/drivers/provider.go +++ b/vendor/github.com/docker/swarmkit/manager/drivers/provider.go @@ -22,7 +22,7 @@ func (m *DriverProvider) NewSecretDriver(driver *api.Driver) (*SecretDriver, err if m.pluginGetter == nil { return nil, fmt.Errorf("plugin getter is nil") } - if driver == nil && driver.Name == "" { + if driver == nil || driver.Name == "" { return nil, fmt.Errorf("driver specification is nil") } // Search for the specified plugin diff --git a/vendor/github.com/docker/swarmkit/manager/manager.go b/vendor/github.com/docker/swarmkit/manager/manager.go index 08b158db441c9..ba7b00566563e 100644 --- a/vendor/github.com/docker/swarmkit/manager/manager.go +++ b/vendor/github.com/docker/swarmkit/manager/manager.go @@ -4,6 +4,7 @@ import ( "context" "crypto/tls" "fmt" + "math" "net" "os" "path/filepath" @@ -758,6 +759,7 @@ func (m *Manager) updateKEK(ctx context.Context, cluster *api.Cluster) error { func(addr string, timeout time.Duration) (net.Conn, error) { return xnet.DialTimeoutLocal(addr, timeout) }), + grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(math.MaxInt32)), ) if err != nil { logger.WithError(err).Error("failed to connect to local manager socket after locking the cluster") @@ -1202,12 +1204,8 @@ func newIngressNetwork() *api.Network { }, DriverConfig: &api.Driver{}, IPAM: &api.IPAMOptions{ - Driver: &api.Driver{}, - Configs: []*api.IPAMConfig{ - { - Subnet: "10.255.0.0/16", - }, - }, + Driver: &api.Driver{}, + Configs: []*api.IPAMConfig{}, }, }, } diff --git a/vendor/github.com/docker/swarmkit/manager/orchestrator/restart/restart.go b/vendor/github.com/docker/swarmkit/manager/orchestrator/restart/restart.go index c034183ba290f..c79d02d98f92a 100644 --- a/vendor/github.com/docker/swarmkit/manager/orchestrator/restart/restart.go +++ b/vendor/github.com/docker/swarmkit/manager/orchestrator/restart/restart.go @@ -508,20 +508,13 @@ func (r *Supervisor) Cancel(taskID string) { <-delay.doneCh } -// CancelAll aborts all pending restarts and waits for any instances of -// StartNow that have already triggered to complete. +// CancelAll aborts all pending restarts func (r *Supervisor) CancelAll() { - var cancelled []delayedStart - r.mu.Lock() for _, delay := range r.delays { delay.cancel() } r.mu.Unlock() - - for _, delay := range cancelled { - <-delay.doneCh - } } // ClearServiceHistory forgets restart history related to a given service ID. diff --git a/vendor/github.com/docker/swarmkit/manager/orchestrator/service.go b/vendor/github.com/docker/swarmkit/manager/orchestrator/service.go index 037e493b3026a..c5d298c516ced 100644 --- a/vendor/github.com/docker/swarmkit/manager/orchestrator/service.go +++ b/vendor/github.com/docker/swarmkit/manager/orchestrator/service.go @@ -47,22 +47,27 @@ func SetServiceTasksRemove(ctx context.Context, s *store.MemoryStore, service *a err = s.Batch(func(batch *store.Batch) error { for _, t := range tasks { err := batch.Update(func(tx store.Tx) error { + // the task may have changed for some reason in the meantime + // since we read it out, so we need to get from the store again + // within the boundaries of a transaction + latestTask := store.GetTask(tx, t.ID) + // time travel is not allowed. if the current desired state is // above the one we're trying to go to we can't go backwards. // we have nothing to do and we should skip to the next task - if t.DesiredState > api.TaskStateRemove { + if latestTask.DesiredState > api.TaskStateRemove { // log a warning, though. we shouln't be trying to rewrite // a state to an earlier state log.G(ctx).Warnf( "cannot update task %v in desired state %v to an earlier desired state %v", - t.ID, t.DesiredState, api.TaskStateRemove, + latestTask.ID, latestTask.DesiredState, api.TaskStateRemove, ) return nil } // update desired state to REMOVE - t.DesiredState = api.TaskStateRemove + latestTask.DesiredState = api.TaskStateRemove - if err := store.UpdateTask(tx, t); err != nil { + if err := store.UpdateTask(tx, latestTask); err != nil { log.G(ctx).WithError(err).Errorf("failed transaction: update task desired state to REMOVE") } return nil diff --git a/vendor/github.com/docker/swarmkit/manager/orchestrator/update/updater.go b/vendor/github.com/docker/swarmkit/manager/orchestrator/update/updater.go index 7c977dba1c4b3..4e6a2cc0bd90e 100644 --- a/vendor/github.com/docker/swarmkit/manager/orchestrator/update/updater.go +++ b/vendor/github.com/docker/swarmkit/manager/orchestrator/update/updater.go @@ -501,7 +501,10 @@ func (u *Updater) removeOldTasks(ctx context.Context, batch *store.Batch, remove return fmt.Errorf("task %s not found while trying to shut it down", original.ID) } if t.DesiredState > api.TaskStateRunning { - return fmt.Errorf("task %s was already shut down when reached by updater", original.ID) + return fmt.Errorf( + "task %s was already shut down when reached by updater (state: %v)", + original.ID, t.DesiredState, + ) } t.DesiredState = api.TaskStateShutdown return store.UpdateTask(tx, t) diff --git a/vendor/github.com/docker/swarmkit/manager/scheduler/nodeinfo.go b/vendor/github.com/docker/swarmkit/manager/scheduler/nodeinfo.go index 3094402a586cb..8f1ad331b5ada 100644 --- a/vendor/github.com/docker/swarmkit/manager/scheduler/nodeinfo.go +++ b/vendor/github.com/docker/swarmkit/manager/scheduler/nodeinfo.go @@ -45,8 +45,8 @@ type NodeInfo struct { func newNodeInfo(n *api.Node, tasks map[string]*api.Task, availableResources api.Resources) NodeInfo { nodeInfo := NodeInfo{ - Node: n, - Tasks: make(map[string]*api.Task), + Node: n, + Tasks: make(map[string]*api.Task), ActiveTasksCountByService: make(map[string]int), AvailableResources: availableResources.Copy(), usedHostPorts: make(map[hostPortSpec]struct{}), diff --git a/vendor/github.com/docker/swarmkit/node/node.go b/vendor/github.com/docker/swarmkit/node/node.go index 92bd74884e9f2..1d3e6c4f50ab0 100644 --- a/vendor/github.com/docker/swarmkit/node/node.go +++ b/vendor/github.com/docker/swarmkit/node/node.go @@ -6,6 +6,7 @@ import ( "crypto/tls" "encoding/json" "io/ioutil" + "math" "net" "os" "path/filepath" @@ -896,6 +897,7 @@ func (n *Node) initManagerConnection(ctx context.Context, ready chan<- struct{}) opts := []grpc.DialOption{ grpc.WithUnaryInterceptor(grpc_prometheus.UnaryClientInterceptor), grpc.WithStreamInterceptor(grpc_prometheus.StreamClientInterceptor), + grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(math.MaxInt32)), } insecureCreds := credentials.NewTLS(&tls.Config{InsecureSkipVerify: true}) opts = append(opts, grpc.WithTransportCredentials(insecureCreds))