Browse Source

Merge pull request #47201 from thaJeztah/25.0_backport_swarm_rotate_key_flake

[25.0 backport] De-flake TestSwarmClusterRotateUnlockKey... again... maybe?
Sebastiaan van Stijn 1 năm trước cách đây
mục cha
commit
a140d0d95f

+ 3 - 0
integration-cli/check_test.go

@@ -657,6 +657,9 @@ func (s *DockerSwarmSuite) TearDownTest(ctx context.Context, c *testing.T) {
 	s.daemonsLock.Lock()
 	s.daemonsLock.Lock()
 	for _, d := range s.daemons {
 	for _, d := range s.daemons {
 		if d != nil {
 		if d != nil {
+			if c.Failed() {
+				d.TailLogsT(c, 100)
+			}
 			d.Stop(c)
 			d.Stop(c)
 			d.Cleanup(c)
 			d.Cleanup(c)
 		}
 		}

+ 54 - 53
integration-cli/docker_cli_swarm_test.go

@@ -27,6 +27,7 @@ import (
 	remoteipam "github.com/docker/docker/libnetwork/ipams/remote/api"
 	remoteipam "github.com/docker/docker/libnetwork/ipams/remote/api"
 	"github.com/docker/docker/pkg/plugins"
 	"github.com/docker/docker/pkg/plugins"
 	"github.com/docker/docker/testutil"
 	"github.com/docker/docker/testutil"
+	testdaemon "github.com/docker/docker/testutil/daemon"
 	"github.com/moby/swarmkit/v2/ca/keyutils"
 	"github.com/moby/swarmkit/v2/ca/keyutils"
 	"github.com/vishvananda/netlink"
 	"github.com/vishvananda/netlink"
 	"gotest.tools/v3/assert"
 	"gotest.tools/v3/assert"
@@ -1260,6 +1261,8 @@ func (s *DockerSwarmSuite) TestSwarmJoinPromoteLocked(c *testing.T) {
 	poll.WaitOn(c, pollCheck(c, d3.CheckLocalNodeState(ctx), checker.Equals(swarm.LocalNodeStateActive)), poll.WithTimeout(time.Second))
 	poll.WaitOn(c, pollCheck(c, d3.CheckLocalNodeState(ctx), checker.Equals(swarm.LocalNodeStateActive)), poll.WithTimeout(time.Second))
 }
 }
 
 
+const swarmIsEncryptedMsg = "Swarm is encrypted and needs to be unlocked"
+
 func (s *DockerSwarmSuite) TestSwarmRotateUnlockKey(c *testing.T) {
 func (s *DockerSwarmSuite) TestSwarmRotateUnlockKey(c *testing.T) {
 	ctx := testutil.GetContext(c)
 	ctx := testutil.GetContext(c)
 	d := s.AddDaemon(ctx, c, true, true)
 	d := s.AddDaemon(ctx, c, true, true)
@@ -1280,12 +1283,16 @@ func (s *DockerSwarmSuite) TestSwarmRotateUnlockKey(c *testing.T) {
 		d.RestartNode(c)
 		d.RestartNode(c)
 		assert.Equal(c, getNodeStatus(c, d), swarm.LocalNodeStateLocked)
 		assert.Equal(c, getNodeStatus(c, d), swarm.LocalNodeStateLocked)
 
 
+		unlock := func(d *daemon.Daemon, key string) *icmd.Result {
+			cmd := d.Command("swarm", "unlock")
+			cmd.Stdin = strings.NewReader(key)
+			return icmd.RunCmd(cmd)
+		}
+
 		outs, _ = d.Cmd("node", "ls")
 		outs, _ = d.Cmd("node", "ls")
-		assert.Assert(c, strings.Contains(outs, "Swarm is encrypted and needs to be unlocked"), outs)
-		cmd := d.Command("swarm", "unlock")
-		cmd.Stdin = bytes.NewBufferString(unlockKey)
-		result := icmd.RunCmd(cmd)
+		assert.Assert(c, strings.Contains(outs, swarmIsEncryptedMsg), outs)
 
 
+		result := unlock(d, unlockKey)
 		if result.Error == nil {
 		if result.Error == nil {
 			// On occasion, the daemon may not have finished
 			// On occasion, the daemon may not have finished
 			// rotating the KEK before restarting. The test is
 			// rotating the KEK before restarting. The test is
@@ -1295,13 +1302,16 @@ func (s *DockerSwarmSuite) TestSwarmRotateUnlockKey(c *testing.T) {
 			// restart again, the new key should be required this
 			// restart again, the new key should be required this
 			// time.
 			// time.
 
 
-			time.Sleep(3 * time.Second)
+			// Wait for the rotation to happen
+			// Since there are multiple rotations, we need to wait until for the number of rotations we are currently on to be reflected in the logs
+			// This is a little janky... its depending on specific log messages AND these are debug logs... but it is the best we can do for now.
+			matcher := testdaemon.ScanLogsMatchCount(testdaemon.ScanLogsMatchString("successfully rotated KEK"), i+1)
+			poll.WaitOn(c, d.PollCheckLogs(ctx, matcher), poll.WithDelay(3*time.Second), poll.WithTimeout(time.Minute))
+			d.Restart(c)
 
 
 			d.RestartNode(c)
 			d.RestartNode(c)
 
 
-			cmd = d.Command("swarm", "unlock")
-			cmd.Stdin = bytes.NewBufferString(unlockKey)
-			result = icmd.RunCmd(cmd)
+			result = unlock(d, unlockKey)
 		}
 		}
 		result.Assert(c, icmd.Expected{
 		result.Assert(c, icmd.Expected{
 			ExitCode: 1,
 			ExitCode: 1,
@@ -1309,28 +1319,20 @@ func (s *DockerSwarmSuite) TestSwarmRotateUnlockKey(c *testing.T) {
 		})
 		})
 
 
 		outs, _ = d.Cmd("node", "ls")
 		outs, _ = d.Cmd("node", "ls")
-		assert.Assert(c, strings.Contains(outs, "Swarm is encrypted and needs to be unlocked"), outs)
-		cmd = d.Command("swarm", "unlock")
-		cmd.Stdin = bytes.NewBufferString(newUnlockKey)
-		icmd.RunCmd(cmd).Assert(c, icmd.Success)
+		assert.Assert(c, strings.Contains(outs, swarmIsEncryptedMsg), outs)
+		unlock(d, newUnlockKey).Assert(c, icmd.Success)
 
 
 		assert.Equal(c, getNodeStatus(c, d), swarm.LocalNodeStateActive)
 		assert.Equal(c, getNodeStatus(c, d), swarm.LocalNodeStateActive)
 
 
-		retry := 0
-		for {
+		checkNodeLs := func(t poll.LogT) poll.Result {
 			// an issue sometimes prevents leader to be available right away
 			// an issue sometimes prevents leader to be available right away
-			outs, err = d.Cmd("node", "ls")
-			if err != nil && retry < 5 {
-				if strings.Contains(outs, "swarm does not have a leader") {
-					retry++
-					time.Sleep(3 * time.Second)
-					continue
-				}
+			out, err := d.Cmd("node", "ls")
+			if err != nil {
+				return poll.Continue("error running node ls: %v: %s", err, out)
 			}
 			}
-			assert.NilError(c, err)
-			assert.Assert(c, !strings.Contains(outs, "Swarm is encrypted and needs to be unlocked"), outs)
-			break
+			return poll.Success()
 		}
 		}
+		poll.WaitOn(c, checkNodeLs, poll.WithDelay(3*time.Second), poll.WithTimeout(time.Minute))
 
 
 		unlockKey = newUnlockKey
 		unlockKey = newUnlockKey
 	}
 	}
@@ -1368,15 +1370,23 @@ func (s *DockerSwarmSuite) TestSwarmClusterRotateUnlockKey(c *testing.T) {
 		d2.RestartNode(c)
 		d2.RestartNode(c)
 		d3.RestartNode(c)
 		d3.RestartNode(c)
 
 
+		unlock := func(d *daemon.Daemon, key string) *icmd.Result {
+			cmd := d.Command("swarm", "unlock")
+			cmd.Stdin = strings.NewReader(key)
+			return icmd.RunCmd(cmd)
+		}
+
+		const swarmIsEncryptedMsg = "Swarm is encrypted and needs to be unlocked"
+
 		for _, d := range []*daemon.Daemon{d2, d3} {
 		for _, d := range []*daemon.Daemon{d2, d3} {
 			assert.Equal(c, getNodeStatus(c, d), swarm.LocalNodeStateLocked)
 			assert.Equal(c, getNodeStatus(c, d), swarm.LocalNodeStateLocked)
 
 
 			outs, _ := d.Cmd("node", "ls")
 			outs, _ := d.Cmd("node", "ls")
-			assert.Assert(c, strings.Contains(outs, "Swarm is encrypted and needs to be unlocked"), outs)
-			cmd := d.Command("swarm", "unlock")
-			cmd.Stdin = bytes.NewBufferString(unlockKey)
-			result := icmd.RunCmd(cmd)
+			assert.Assert(c, strings.Contains(outs, swarmIsEncryptedMsg), outs)
 
 
+			// unlock with the original key should fail
+			// Use poll here because the daemon may not have finished
+			result := unlock(d, unlockKey)
 			if result.Error == nil {
 			if result.Error == nil {
 				// On occasion, the daemon may not have finished
 				// On occasion, the daemon may not have finished
 				// rotating the KEK before restarting. The test is
 				// rotating the KEK before restarting. The test is
@@ -1386,13 +1396,14 @@ func (s *DockerSwarmSuite) TestSwarmClusterRotateUnlockKey(c *testing.T) {
 				// restart again, the new key should be required this
 				// restart again, the new key should be required this
 				// time.
 				// time.
 
 
-				time.Sleep(3 * time.Second)
-
-				d.RestartNode(c)
+				// Wait for the rotation to happen
+				// Since there are multiple rotations, we need to wait until for the number of rotations we are currently on to be reflected in the logs
+				// This is a little janky... its depending on specific log messages AND these are debug logs... but it is the best we can do for now.
+				matcher := testdaemon.ScanLogsMatchCount(testdaemon.ScanLogsMatchString("successfully rotated KEK"), i+1)
+				poll.WaitOn(c, d.PollCheckLogs(ctx, matcher), poll.WithDelay(3*time.Second), poll.WithTimeout(time.Minute))
+				d.Restart(c)
 
 
-				cmd = d.Command("swarm", "unlock")
-				cmd.Stdin = bytes.NewBufferString(unlockKey)
-				result = icmd.RunCmd(cmd)
+				result = unlock(d, unlockKey)
 			}
 			}
 			result.Assert(c, icmd.Expected{
 			result.Assert(c, icmd.Expected{
 				ExitCode: 1,
 				ExitCode: 1,
@@ -1400,31 +1411,21 @@ func (s *DockerSwarmSuite) TestSwarmClusterRotateUnlockKey(c *testing.T) {
 			})
 			})
 
 
 			outs, _ = d.Cmd("node", "ls")
 			outs, _ = d.Cmd("node", "ls")
-			assert.Assert(c, strings.Contains(outs, "Swarm is encrypted and needs to be unlocked"), outs)
-			cmd = d.Command("swarm", "unlock")
-			cmd.Stdin = bytes.NewBufferString(newUnlockKey)
-			icmd.RunCmd(cmd).Assert(c, icmd.Success)
+			assert.Assert(c, strings.Contains(outs, swarmIsEncryptedMsg), outs)
 
 
+			// now unlock with the rotated key, this should succeed
+			unlock(d, newUnlockKey).Assert(c, icmd.Success)
 			assert.Equal(c, getNodeStatus(c, d), swarm.LocalNodeStateActive)
 			assert.Equal(c, getNodeStatus(c, d), swarm.LocalNodeStateActive)
 
 
-			retry := 0
-			for {
+			checkNodeLs := func(t poll.LogT) poll.Result {
 				// an issue sometimes prevents leader to be available right away
 				// an issue sometimes prevents leader to be available right away
-				outs, err = d.Cmd("node", "ls")
-				if err != nil && retry < 5 {
-					if strings.Contains(outs, "swarm does not have a leader") {
-						retry++
-						c.Logf("[%s] got 'swarm does not have a leader'. retrying (attempt %d/5)", d.ID(), retry)
-						time.Sleep(3 * time.Second)
-						continue
-					} else {
-						c.Logf("[%s] gave error: '%v'. retrying (attempt %d/5): %s", d.ID(), err, retry, outs)
-					}
+				out, err := d.Cmd("node", "ls")
+				if err != nil {
+					return poll.Continue("error running node ls: %v: %s", err, out)
 				}
 				}
-				assert.NilError(c, err, "[%s] failed after %d retries: %v (%s)", d.ID(), retry, err, outs)
-				assert.Assert(c, !strings.Contains(outs, "Swarm is encrypted and needs to be unlocked"), outs)
-				break
+				return poll.Success()
 			}
 			}
+			poll.WaitOn(c, checkNodeLs, poll.WithDelay(3*time.Second), poll.WithTimeout(time.Minute))
 		}
 		}
 
 
 		unlockKey = newUnlockKey
 		unlockKey = newUnlockKey

+ 11 - 0
testutil/daemon/daemon.go

@@ -338,6 +338,17 @@ func ScanLogsMatchString(contains string) func(string) bool {
 	}
 	}
 }
 }
 
 
+// ScanLogsMatchCount returns a function that can be used to scan the daemon logs until the passed in matcher function matches `count` times
+func ScanLogsMatchCount(f func(string) bool, count int) func(string) bool {
+	matched := 0
+	return func(line string) bool {
+		if f(line) {
+			matched++
+		}
+		return matched == count
+	}
+}
+
 // ScanLogsMatchAll returns a function that can be used to scan the daemon logs until *all* the passed in strings are matched
 // ScanLogsMatchAll returns a function that can be used to scan the daemon logs until *all* the passed in strings are matched
 func ScanLogsMatchAll(contains ...string) func(string) bool {
 func ScanLogsMatchAll(contains ...string) func(string) bool {
 	matched := make(map[string]bool)
 	matched := make(map[string]bool)