
Merge pull request #32449 from aaronlehmann/cluster-locking

cluster: Allow reentrant calls to methods during shutdown
Tõnis Tiigi, 8 years ago
commit 01c80435c6
3 changed files with 17 additions and 11 deletions
  1. daemon/cluster/cluster.go (+6, -1)
  2. daemon/cluster/noderunner.go (+1, -2)
  3. daemon/cluster/swarm.go (+10, -8)

+ 6 - 1
daemon/cluster/cluster.go

@@ -334,8 +334,9 @@ func (c *Cluster) Cleanup() {
 		c.mu.Unlock()
 		return
 	}
-	defer c.mu.Unlock()
 	state := c.currentNodeState()
+	c.mu.Unlock()
+
 	if state.IsActiveManager() {
 		active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
 		if err == nil {
@@ -345,11 +346,15 @@ func (c *Cluster) Cleanup() {
 			}
 		}
 	}
+
 	if err := node.Stop(); err != nil {
 		logrus.Errorf("failed to shut down cluster node: %v", err)
 		signal.DumpStacks("")
 	}
+
+	c.mu.Lock()
 	c.nr = nil
+	c.mu.Unlock()
 }
 
 func managerStats(client swarmapi.ControlClient, currentNodeID string) (current bool, reachable int, unreachable int, err error) {
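
The hunks above are the core of the fix: Cleanup no longer holds c.mu across node.Stop(), because stopping the node can call back into Cluster methods that also take c.mu, which previously deadlocked; the mutex is re-acquired only to clear c.nr afterwards. Below is a minimal, self-contained sketch of that ordering. The names (Coordinator, worker, Status) are hypothetical and only illustrate the locking discipline, not the Docker code itself.

package main

import (
	"fmt"
	"sync"
	"time"
)

type Coordinator struct {
	mu     sync.Mutex
	worker *worker
}

type worker struct {
	c *Coordinator
}

// Status stands in for a reentrant call made while the worker shuts
// down; it needs the same mutex, so Cleanup must not hold it across
// worker.stop.
func (c *Coordinator) Status() string {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.worker == nil {
		return "inactive"
	}
	return "active"
}

// stop simulates a shutdown path that calls back into the Coordinator.
func (w *worker) stop() {
	fmt.Println("status during shutdown:", w.c.Status())
	time.Sleep(10 * time.Millisecond)
}

// Cleanup mirrors the ordering in the hunks above: read shared state
// under the lock, release it before the blocking stop, and re-acquire
// it only to clear the reference afterwards.
func (c *Coordinator) Cleanup() {
	c.mu.Lock()
	w := c.worker
	if w == nil {
		c.mu.Unlock()
		return
	}
	c.mu.Unlock()

	w.stop()

	c.mu.Lock()
	c.worker = nil
	c.mu.Unlock()
}

func main() {
	c := &Coordinator{}
	c.worker = &worker{c: c}
	c.Cleanup()
	fmt.Println("status after cleanup:", c.Status())
}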

+ 1 - 2
daemon/cluster/noderunner.go

@@ -210,11 +210,10 @@ func (n *nodeRunner) Stop() error {
 	n.stopping = true
 	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
 	defer cancel()
+	n.mu.Unlock()
 	if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
-		n.mu.Unlock()
 		return err
 	}
-	n.mu.Unlock()
 	<-n.done
 	return nil
 }
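
nodeRunner.Stop applies the same rule: set the stopping flag while n.mu is held, then release the mutex before the blocking swarmNode.Stop call and the wait on n.done. A hedged sketch of that ordering follows, with made-up names (runner, stopNode); it is not the real nodeRunner.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

type runner struct {
	mu       sync.Mutex
	stopping bool
	done     chan struct{}
}

// stopNode stands in for swarmNode.Stop: it signals the run loop to
// exit and returns without blocking.
func (r *runner) stopNode(ctx context.Context) error {
	go func() {
		time.Sleep(10 * time.Millisecond)
		close(r.done)
	}()
	return nil
}

// Stop mirrors the hunk above: mark the runner as stopping while the
// lock is held, then release the lock before anything that can block,
// so callbacks needing the same mutex are not deadlocked.
func (r *runner) Stop() error {
	r.mu.Lock()
	if r.stopping {
		r.mu.Unlock()
		return nil
	}
	r.stopping = true
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	r.mu.Unlock()

	if err := r.stopNode(ctx); err != nil {
		return err
	}
	<-r.done // wait for the run loop without holding the lock
	return nil
}

func main() {
	r := &runner{done: make(chan struct{})}
	fmt.Println("stop:", r.Stop())
}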

+ 10 - 8
daemon/cluster/swarm.go

@@ -25,19 +25,20 @@ import (
 func (c *Cluster) Init(req types.InitRequest) (string, error) {
 	c.controlMutex.Lock()
 	defer c.controlMutex.Unlock()
-	c.mu.Lock()
 	if c.nr != nil {
 		if req.ForceNewCluster {
+			// Take c.mu temporarily to wait for presently running
+			// API handlers to finish before shutting down the node.
+			c.mu.Lock()
+			c.mu.Unlock()
+
 			if err := c.nr.Stop(); err != nil {
-				c.mu.Unlock()
 				return "", err
 			}
 		} else {
-			c.mu.Unlock()
 			return "", errSwarmExists
 		}
 	}
-	c.mu.Unlock()
 
 	if err := validateAndSanitizeInitRequest(&req); err != nil {
 		return "", apierrors.NewBadRequestError(err)
@@ -325,9 +326,10 @@ func (c *Cluster) Leave(force bool) error {
 
 	state := c.currentNodeState()
 
+	c.mu.Unlock()
+
 	if errors.Cause(state.err) == errSwarmLocked && !force {
 		// leave a locked swarm without --force is not allowed
-		c.mu.Unlock()
 		return errors.New("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message.")
 	}
 
@@ -339,7 +341,6 @@ func (c *Cluster) Leave(force bool) error {
 				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
 					if isLastManager(reachable, unreachable) {
 						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
-						c.mu.Unlock()
 						return errors.New(msg)
 					}
 					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
@@ -350,18 +351,19 @@ func (c *Cluster) Leave(force bool) error {
 		}
 
 		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
-		c.mu.Unlock()
 		return errors.New(msg)
 	}
 	// release readers in here
 	if err := nr.Stop(); err != nil {
 		logrus.Errorf("failed to shut down cluster node: %v", err)
 		signal.DumpStacks("")
-		c.mu.Unlock()
 		return err
 	}
+
+	c.mu.Lock()
 	c.nr = nil
 	c.mu.Unlock()
+
 	if nodeID := state.NodeID(); nodeID != "" {
 		nodeContainers, err := c.listContainerForNode(nodeID)
 		if err != nil {
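
One note on the swarm.go changes: the Lock/Unlock pair added to Init is deliberately empty. It acts as a barrier, waiting for API handlers that currently hold c.mu to finish before the node is shut down, and then releases the mutex so the shutdown path can take it itself. A minimal sketch of that barrier pattern, with hypothetical names (server, handle, reinit), not the Docker code:

package main

import (
	"fmt"
	"sync"
	"time"
)

type server struct {
	mu sync.Mutex
}

// handle stands in for an in-flight API handler that holds the mutex
// for the duration of a request.
func (s *server) handle(done chan<- struct{}) {
	s.mu.Lock()
	time.Sleep(20 * time.Millisecond)
	s.mu.Unlock()
	close(done)
}

// reinit mirrors the Lock/Unlock pair added to Init: the mutex is taken
// only as a barrier, so we proceed once current handlers have released
// it, and we do not hold it across the stop/start sequence that follows.
func (s *server) reinit() {
	s.mu.Lock()
	s.mu.Unlock()

	fmt.Println("running handlers drained; safe to stop the node")
}

func main() {
	s := &server{}
	done := make(chan struct{})
	go s.handle(done)
	time.Sleep(time.Millisecond) // give the handler time to grab the lock
	s.reinit()
	<-done
}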