From c502db4955f9d04dc782d5bc2b2c22d0630bde4a Mon Sep 17 00:00:00 2001 From: Tonis Tiigi Date: Mon, 15 Jul 2019 17:23:55 -0700 Subject: [PATCH 1/3] integration-cli: allow temporary errors on leader switch Signed-off-by: Tonis Tiigi (cherry picked from commit 3df1095bbdc331d4effa5452d8aafd5aaead5789) Signed-off-by: Sebastiaan van Stijn --- integration-cli/docker_api_swarm_test.go | 16 ++++++++++++++-- internal/test/daemon/node.go | 9 ++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/integration-cli/docker_api_swarm_test.go b/integration-cli/docker_api_swarm_test.go index e7d1b983322fc..7b498d5ea9e6f 100644 --- a/integration-cli/docker_api_swarm_test.go +++ b/integration-cli/docker_api_swarm_test.go @@ -27,6 +27,7 @@ import ( "github.com/docker/docker/internal/test/request" "github.com/docker/swarmkit/ca" "github.com/go-check/check" + "github.com/pkg/errors" "gotest.tools/assert" is "gotest.tools/assert/cmp" ) @@ -313,13 +314,24 @@ func (s *DockerSwarmSuite) TestAPISwarmLeaderElection(c *check.C) { leader *daemon.Daemon // keep track of leader followers []*daemon.Daemon // keep track of followers ) + var lastErr error checkLeader := func(nodes ...*daemon.Daemon) checkF { return func(c *check.C) (interface{}, check.CommentInterface) { // clear these out before each run leader = nil followers = nil for _, d := range nodes { - if d.GetNode(c, d.NodeID()).ManagerStatus.Leader { + n := d.GetNode(c, d.NodeID(), func(err error) bool { + if strings.Contains(errors.Cause(err).Error(), context.DeadlineExceeded.Error()) || strings.Contains(err.Error(), "swarm does not have a leader") { + lastErr = err + return true + } + return false + }) + if n == nil { + return false, check.Commentf("failed to get node: %v", lastErr) + } + if n.ManagerStatus.Leader { leader = d } else { followers = append(followers, d) @@ -391,7 +403,7 @@ func (s *DockerSwarmSuite) TestAPISwarmRaftQuorum(c *check.C) { defer cli.Close() // d1 will eventually step down from leader because there is no longer an active quorum, wait for that to happen - waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) { + waitAndAssert(c, defaultReconciliationTimeout*2, func(c *check.C) (interface{}, check.CommentInterface) { _, err := cli.ServiceCreate(context.Background(), service.Spec, types.ServiceCreateOptions{}) return err.Error(), nil }, checker.Contains, "Make sure more than half of the managers are online.") diff --git a/internal/test/daemon/node.go b/internal/test/daemon/node.go index 33dd365429531..d5f6f15263fa4 100644 --- a/internal/test/daemon/node.go +++ b/internal/test/daemon/node.go @@ -15,7 +15,7 @@ import ( type NodeConstructor func(*swarm.Node) // GetNode returns a swarm node identified by the specified id -func (d *Daemon) GetNode(t assert.TestingT, id string) *swarm.Node { +func (d *Daemon) GetNode(t assert.TestingT, id string, errCheck ...func(error) bool) *swarm.Node { if ht, ok := t.(test.HelperT); ok { ht.Helper() } @@ -23,6 +23,13 @@ func (d *Daemon) GetNode(t assert.TestingT, id string) *swarm.Node { defer cli.Close() node, _, err := cli.NodeInspectWithRaw(context.Background(), id) + if err != nil { + for _, f := range errCheck { + if f(err) { + return nil + } + } + } assert.NilError(t, err, "[%s] (*Daemon).GetNode: NodeInspectWithRaw(%q) failed", d.id, id) assert.Check(t, node.ID == id) return &node From cf05755e9ddb6ee976720ad1a5093aa0200851e6 Mon Sep 17 00:00:00 2001 From: Tonis Tiigi Date: Tue, 16 Jul 2019 14:07:26 -0700 Subject: [PATCH 2/3] integration-cli: allow temporary no-leader error Signed-off-by: Tonis Tiigi (cherry picked from commit 52e0dfef9090fa3c6003115a2c82238b189ebe42) Signed-off-by: Sebastiaan van Stijn --- integration-cli/docker_cli_swarm_test.go | 36 ++++++++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/integration-cli/docker_cli_swarm_test.go b/integration-cli/docker_cli_swarm_test.go index bfc76ea1b9a98..5d1514930a9bc 100644 --- a/integration-cli/docker_cli_swarm_test.go +++ b/integration-cli/docker_cli_swarm_test.go @@ -1303,9 +1303,21 @@ func (s *DockerSwarmSuite) TestSwarmRotateUnlockKey(c *check.C) { c.Assert(getNodeStatus(c, d), checker.Equals, swarm.LocalNodeStateActive) - outs, err = d.Cmd("node", "ls") - assert.NilError(c, err) - c.Assert(outs, checker.Not(checker.Contains), "Swarm is encrypted and needs to be unlocked") + retry := 0 + for { + // an issue sometimes prevents leader to be available right away + outs, err = d.Cmd("node", "ls") + if err != nil && retry < 5 { + if strings.Contains(err.Error(), "swarm does not have a leader") { + retry++ + time.Sleep(3 * time.Second) + continue + } + } + assert.NilError(c, err) + c.Assert(outs, checker.Not(checker.Contains), "Swarm is encrypted and needs to be unlocked") + break + } unlockKey = newUnlockKey } @@ -1383,9 +1395,21 @@ func (s *DockerSwarmSuite) TestSwarmClusterRotateUnlockKey(c *check.C) { c.Assert(getNodeStatus(c, d), checker.Equals, swarm.LocalNodeStateActive) - outs, err = d.Cmd("node", "ls") - c.Assert(err, checker.IsNil, check.Commentf("%s", outs)) - c.Assert(outs, checker.Not(checker.Contains), "Swarm is encrypted and needs to be unlocked") + retry := 0 + for { + // an issue sometimes prevents leader to be available right away + outs, err = d.Cmd("node", "ls") + if err != nil && retry < 5 { + if strings.Contains(err.Error(), "swarm does not have a leader") { + retry++ + time.Sleep(3 * time.Second) + continue + } + } + c.Assert(err, checker.IsNil, check.Commentf("%s", outs)) + c.Assert(outs, checker.Not(checker.Contains), "Swarm is encrypted and needs to be unlocked") + break + } } unlockKey = newUnlockKey From c93da8ded9ac4efac6ac23af2ac32ab1ac50bb9b Mon Sep 17 00:00:00 2001 From: Drew Erny Date: Fri, 26 Jul 2019 13:05:00 -0500 Subject: [PATCH 3/3] Fix TestSwarmClusterRotateUnlockKey TestSwarmClusterRotateUnlockKey had been identified as a flaky test. It turns out that the test code was wrong: where we should have been checking the string output of a command, we were instead checking the value of the error. This means that the error case we were expecting was not being matched, and the test was failing when it should have just retried. Signed-off-by: Drew Erny (cherry picked from commit b79adac339173bf8bb6de6d0a061a97973c4b62b) Signed-off-by: Sebastiaan van Stijn --- integration-cli/docker_cli_swarm_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/integration-cli/docker_cli_swarm_test.go b/integration-cli/docker_cli_swarm_test.go index 5d1514930a9bc..64d4fade55a8e 100644 --- a/integration-cli/docker_cli_swarm_test.go +++ b/integration-cli/docker_cli_swarm_test.go @@ -1308,7 +1308,7 @@ func (s *DockerSwarmSuite) TestSwarmRotateUnlockKey(c *check.C) { // an issue sometimes prevents leader to be available right away outs, err = d.Cmd("node", "ls") if err != nil && retry < 5 { - if strings.Contains(err.Error(), "swarm does not have a leader") { + if strings.Contains(outs, "swarm does not have a leader") { retry++ time.Sleep(3 * time.Second) continue @@ -1400,7 +1400,7 @@ func (s *DockerSwarmSuite) TestSwarmClusterRotateUnlockKey(c *check.C) { // an issue sometimes prevents leader to be available right away outs, err = d.Cmd("node", "ls") if err != nil && retry < 5 { - if strings.Contains(err.Error(), "swarm does not have a leader") { + if strings.Contains(outs, "swarm does not have a leader") { retry++ time.Sleep(3 * time.Second) continue