diff --git a/daemon/cluster/noderunner.go b/daemon/cluster/noderunner.go
index 2ec13b4639..c0c7529ed9 100644
--- a/daemon/cluster/noderunner.go
+++ b/daemon/cluster/noderunner.go
@@ -50,6 +50,9 @@ type nodeStartConfig struct {
 	AdvertiseAddr string
 	// DataPathAddr is the address that has to be used for the data path
 	DataPathAddr string
+	// JoinInProgress is set to true if a join operation has started, but
+	// not completed yet.
+	JoinInProgress bool
 
 	joinAddr        string
 	forceNewCluster bool
@@ -98,6 +101,13 @@ func (n *nodeRunner) start(conf nodeStartConfig) error {
 		control = filepath.Join(n.cluster.runtimeRoot, controlSocket)
 	}
 
+	joinAddr := conf.joinAddr
+	if joinAddr == "" && conf.JoinInProgress {
+		// We must have been restarted while trying to join a cluster.
+		// Continue trying to join instead of forming our own cluster.
+		joinAddr = conf.RemoteAddr
+	}
+
 	// Hostname is not set here. Instead, it is obtained from
 	// the node description that is reported periodically
 	swarmnodeConfig := swarmnode.Config{
@@ -105,7 +115,7 @@ func (n *nodeRunner) start(conf nodeStartConfig) error {
 		ListenControlAPI:   control,
 		ListenRemoteAPI:    conf.ListenAddr,
 		AdvertiseRemoteAPI: conf.AdvertiseAddr,
-		JoinAddr:           conf.joinAddr,
+		JoinAddr:           joinAddr,
 		StateDir:           n.cluster.root,
 		JoinToken:          conf.joinToken,
 		Executor:           container.NewExecutor(n.cluster.config.Backend),
@@ -133,6 +143,9 @@ func (n *nodeRunner) start(conf nodeStartConfig) error {
 	n.done = make(chan struct{})
 	n.ready = make(chan struct{})
 	n.swarmNode = node
+	if conf.joinAddr != "" {
+		conf.JoinInProgress = true
+	}
 	n.config = conf
 	savePersistentState(n.cluster.root, conf)
 
@@ -216,6 +229,10 @@ func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node,
 	case <-node.Ready():
 		n.mu.Lock()
 		n.err = nil
+		if n.config.JoinInProgress {
+			n.config.JoinInProgress = false
+			savePersistentState(n.cluster.root, n.config)
+		}
 		n.mu.Unlock()
 		close(ready)
 	case <-ctx.Done():
@@ -306,7 +323,6 @@ func (n *nodeRunner) enableReconnectWatcher() {
 	delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
 	n.cancelReconnect = cancel
 
-	config := n.config
 	go func() {
 		<-delayCtx.Done()
 		if delayCtx.Err() != context.DeadlineExceeded {
@@ -317,15 +333,8 @@ func (n *nodeRunner) enableReconnectWatcher() {
 		if n.stopping {
 			return
 		}
-		remotes := n.cluster.getRemoteAddressList()
-		if len(remotes) > 0 {
-			config.RemoteAddr = remotes[0]
-		} else {
-			config.RemoteAddr = ""
-		}
-		config.joinAddr = config.RemoteAddr
 
-		if err := n.start(config); err != nil {
+		if err := n.start(n.config); err != nil {
 			n.err = err
 		}
 	}()
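
The core of the change is the new persisted `JoinInProgress` flag: a join request sets it before state is saved, the ready handler clears it once the node is in, and a restart that finds it still set falls back to the recorded `RemoteAddr` rather than bootstrapping a single-node cluster. A minimal standalone sketch of that address-selection logic follows; it is not the moby implementation, and names such as `startConfig` and `resolveJoinAddr` are hypothetical.

```go
package main

import "fmt"

// startConfig stands in for the persisted nodeStartConfig fields this diff touches.
type startConfig struct {
	RemoteAddr     string // manager address recorded when the join began
	JoinAddr       string // set only on an explicit join request
	JoinInProgress bool   // persisted; true until the node reports Ready
}

// resolveJoinAddr mirrors the new branch in start(): an empty join address
// normally means "form our own cluster", unless a join was already underway
// before a restart, in which case we keep joining the remembered manager.
func resolveJoinAddr(conf startConfig) string {
	joinAddr := conf.JoinAddr
	if joinAddr == "" && conf.JoinInProgress {
		joinAddr = conf.RemoteAddr
	}
	return joinAddr
}

func main() {
	// Fresh init: no join address, no join in progress.
	fmt.Printf("init:            %q\n", resolveJoinAddr(startConfig{}))
	// Explicit join request.
	fmt.Printf("explicit join:   %q\n", resolveJoinAddr(startConfig{JoinAddr: "10.0.0.1:2377"}))
	// Daemon restarted before the join completed.
	fmt.Printf("restart mid-join: %q\n", resolveJoinAddr(startConfig{RemoteAddr: "10.0.0.1:2377", JoinInProgress: true}))
}
```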