noderunner.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318
  1. package cluster
  2. import (
  3. "fmt"
  4. "path/filepath"
  5. "runtime"
  6. "strings"
  7. "sync"
  8. "time"
  9. "github.com/Sirupsen/logrus"
  10. types "github.com/docker/docker/api/types/swarm"
  11. "github.com/docker/docker/daemon/cluster/executor/container"
  12. lncluster "github.com/docker/libnetwork/cluster"
  13. swarmapi "github.com/docker/swarmkit/api"
  14. swarmnode "github.com/docker/swarmkit/node"
  15. "github.com/pkg/errors"
  16. "golang.org/x/net/context"
  17. "google.golang.org/grpc"
  18. )
  19. // nodeRunner implements a manager for continuously running swarmkit node, restarting them with backoff delays if needed.
  20. type nodeRunner struct {
  21. nodeState
  22. mu sync.RWMutex
  23. done chan struct{} // closed when swarmNode exits
  24. ready chan struct{} // closed when swarmNode becomes active
  25. reconnectDelay time.Duration
  26. config nodeStartConfig
  27. repeatedRun bool
  28. cancelReconnect func()
  29. stopping bool
  30. cluster *Cluster // only for accessing config helpers, never call any methods. TODO: change to config struct
  31. }
  32. // nodeStartConfig holds configuration needed to start a new node. Exported
  33. // fields of this structure are saved to disk in json. Unexported fields
  34. // contain data that shouldn't be persisted between daemon reloads.
  35. type nodeStartConfig struct {
  36. // LocalAddr is this machine's local IP or hostname, if specified.
  37. LocalAddr string
  38. // RemoteAddr is the address that was given to "swarm join". It is used
  39. // to find LocalAddr if necessary.
  40. RemoteAddr string
  41. // ListenAddr is the address we bind to, including a port.
  42. ListenAddr string
  43. // AdvertiseAddr is the address other nodes should connect to,
  44. // including a port.
  45. AdvertiseAddr string
  46. // DataPathAddr is the address that has to be used for the data path
  47. DataPathAddr string
  48. joinAddr string
  49. forceNewCluster bool
  50. joinToken string
  51. lockKey []byte
  52. autolock bool
  53. availability types.NodeAvailability
  54. }
  55. func (n *nodeRunner) Ready() chan error {
  56. c := make(chan error, 1)
  57. n.mu.RLock()
  58. ready, done := n.ready, n.done
  59. n.mu.RUnlock()
  60. go func() {
  61. select {
  62. case <-ready:
  63. case <-done:
  64. }
  65. select {
  66. case <-ready:
  67. default:
  68. n.mu.RLock()
  69. c <- n.err
  70. n.mu.RUnlock()
  71. }
  72. close(c)
  73. }()
  74. return c
  75. }
  76. func (n *nodeRunner) Start(conf nodeStartConfig) error {
  77. n.mu.Lock()
  78. defer n.mu.Unlock()
  79. n.reconnectDelay = initialReconnectDelay
  80. return n.start(conf)
  81. }
  82. func (n *nodeRunner) start(conf nodeStartConfig) error {
  83. var control string
  84. if runtime.GOOS == "windows" {
  85. control = `\\.\pipe\` + controlSocket
  86. } else {
  87. control = filepath.Join(n.cluster.runtimeRoot, controlSocket)
  88. }
  89. // Hostname is not set here. Instead, it is obtained from
  90. // the node description that is reported periodically
  91. swarmnodeConfig := swarmnode.Config{
  92. ForceNewCluster: conf.forceNewCluster,
  93. ListenControlAPI: control,
  94. ListenRemoteAPI: conf.ListenAddr,
  95. AdvertiseRemoteAPI: conf.AdvertiseAddr,
  96. JoinAddr: conf.joinAddr,
  97. StateDir: n.cluster.root,
  98. JoinToken: conf.joinToken,
  99. Executor: container.NewExecutor(n.cluster.config.Backend),
  100. HeartbeatTick: 1,
  101. ElectionTick: 3,
  102. UnlockKey: conf.lockKey,
  103. AutoLockManagers: conf.autolock,
  104. PluginGetter: n.cluster.config.Backend.PluginGetter(),
  105. }
  106. if conf.availability != "" {
  107. avail, ok := swarmapi.NodeSpec_Availability_value[strings.ToUpper(string(conf.availability))]
  108. if !ok {
  109. return fmt.Errorf("invalid Availability: %q", conf.availability)
  110. }
  111. swarmnodeConfig.Availability = swarmapi.NodeSpec_Availability(avail)
  112. }
  113. node, err := swarmnode.New(&swarmnodeConfig)
  114. if err != nil {
  115. return err
  116. }
  117. if err := node.Start(context.Background()); err != nil {
  118. return err
  119. }
  120. n.done = make(chan struct{})
  121. n.ready = make(chan struct{})
  122. n.swarmNode = node
  123. n.config = conf
  124. savePersistentState(n.cluster.root, conf)
  125. ctx, cancel := context.WithCancel(context.Background())
  126. go func() {
  127. n.handleNodeExit(node)
  128. cancel()
  129. }()
  130. go n.handleReadyEvent(ctx, node, n.ready)
  131. go n.handleControlSocketChange(ctx, node)
  132. return nil
  133. }
  134. func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmnode.Node) {
  135. for conn := range node.ListenControlSocket(ctx) {
  136. n.mu.Lock()
  137. if n.grpcConn != conn {
  138. if conn == nil {
  139. n.controlClient = nil
  140. n.logsClient = nil
  141. } else {
  142. n.controlClient = swarmapi.NewControlClient(conn)
  143. n.logsClient = swarmapi.NewLogsClient(conn)
  144. }
  145. }
  146. n.grpcConn = conn
  147. n.mu.Unlock()
  148. n.cluster.SendClusterEvent(lncluster.EventSocketChange)
  149. }
  150. }
  151. func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, ready chan struct{}) {
  152. select {
  153. case <-node.Ready():
  154. n.mu.Lock()
  155. n.err = nil
  156. n.mu.Unlock()
  157. close(ready)
  158. case <-ctx.Done():
  159. }
  160. n.cluster.SendClusterEvent(lncluster.EventNodeReady)
  161. }
  162. func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
  163. err := detectLockedError(node.Err(context.Background()))
  164. if err != nil {
  165. logrus.Errorf("cluster exited with error: %v", err)
  166. }
  167. n.mu.Lock()
  168. n.swarmNode = nil
  169. n.err = err
  170. close(n.done)
  171. select {
  172. case <-n.ready:
  173. n.enableReconnectWatcher()
  174. default:
  175. if n.repeatedRun {
  176. n.enableReconnectWatcher()
  177. }
  178. }
  179. n.repeatedRun = true
  180. n.mu.Unlock()
  181. }
  182. // Stop stops the current swarm node if it is running.
  183. func (n *nodeRunner) Stop() error {
  184. n.mu.Lock()
  185. if n.cancelReconnect != nil { // between restarts
  186. n.cancelReconnect()
  187. n.cancelReconnect = nil
  188. }
  189. if n.swarmNode == nil {
  190. n.mu.Unlock()
  191. return nil
  192. }
  193. n.stopping = true
  194. ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
  195. defer cancel()
  196. n.mu.Unlock()
  197. if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
  198. return err
  199. }
  200. n.cluster.SendClusterEvent(lncluster.EventNodeLeave)
  201. <-n.done
  202. return nil
  203. }
  204. func (n *nodeRunner) State() nodeState {
  205. if n == nil {
  206. return nodeState{status: types.LocalNodeStateInactive}
  207. }
  208. n.mu.RLock()
  209. defer n.mu.RUnlock()
  210. ns := n.nodeState
  211. if ns.err != nil || n.cancelReconnect != nil {
  212. if errors.Cause(ns.err) == errSwarmLocked {
  213. ns.status = types.LocalNodeStateLocked
  214. } else {
  215. ns.status = types.LocalNodeStateError
  216. }
  217. } else {
  218. select {
  219. case <-n.ready:
  220. ns.status = types.LocalNodeStateActive
  221. default:
  222. ns.status = types.LocalNodeStatePending
  223. }
  224. }
  225. return ns
  226. }
  227. func (n *nodeRunner) enableReconnectWatcher() {
  228. if n.stopping {
  229. return
  230. }
  231. n.reconnectDelay *= 2
  232. if n.reconnectDelay > maxReconnectDelay {
  233. n.reconnectDelay = maxReconnectDelay
  234. }
  235. logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
  236. delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
  237. n.cancelReconnect = cancel
  238. config := n.config
  239. go func() {
  240. <-delayCtx.Done()
  241. if delayCtx.Err() != context.DeadlineExceeded {
  242. return
  243. }
  244. n.mu.Lock()
  245. defer n.mu.Unlock()
  246. if n.stopping {
  247. return
  248. }
  249. remotes := n.cluster.getRemoteAddressList()
  250. if len(remotes) > 0 {
  251. config.RemoteAddr = remotes[0]
  252. } else {
  253. config.RemoteAddr = ""
  254. }
  255. config.joinAddr = config.RemoteAddr
  256. if err := n.start(config); err != nil {
  257. n.err = err
  258. }
  259. }()
  260. }
  261. // nodeState represents information about the current state of the cluster and
  262. // provides access to the grpc clients.
  263. type nodeState struct {
  264. swarmNode *swarmnode.Node
  265. grpcConn *grpc.ClientConn
  266. controlClient swarmapi.ControlClient
  267. logsClient swarmapi.LogsClient
  268. status types.LocalNodeState
  269. actualLocalAddr string
  270. err error
  271. }
  272. // IsActiveManager returns true if node is a manager ready to accept control requests. It is safe to access the client properties if this returns true.
  273. func (ns nodeState) IsActiveManager() bool {
  274. return ns.controlClient != nil
  275. }
  276. // IsManager returns true if node is a manager.
  277. func (ns nodeState) IsManager() bool {
  278. return ns.swarmNode != nil && ns.swarmNode.Manager() != nil
  279. }
  280. // NodeID returns node's ID or empty string if node is inactive.
  281. func (ns nodeState) NodeID() string {
  282. if ns.swarmNode != nil {
  283. return ns.swarmNode.NodeID()
  284. }
  285. return ""
  286. }