noderunner.go

package cluster

import (
	"fmt"
	"path/filepath"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/Sirupsen/logrus"
	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/executor/container"
	swarmapi "github.com/docker/swarmkit/api"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/pkg/errors"
	"golang.org/x/net/context"
	"google.golang.org/grpc"
)

// nodeRunner implements a manager for a continuously running swarmkit node,
// restarting it with backoff delays if needed.
type nodeRunner struct {
	nodeState
	mu              sync.RWMutex
	done            chan struct{} // closed when swarmNode exits
	ready           chan struct{} // closed when swarmNode becomes active
	reconnectDelay  time.Duration
	config          nodeStartConfig
	repeatedRun     bool
	cancelReconnect func()
	stopping        bool
	cluster         *Cluster // only for accessing config helpers, never call any methods. TODO: change to config struct
}

// nodeStartConfig holds configuration needed to start a new node. Exported
// fields of this structure are saved to disk in json. Unexported fields
// contain data that shouldn't be persisted between daemon reloads.
type nodeStartConfig struct {
	// LocalAddr is this machine's local IP or hostname, if specified.
	LocalAddr string
	// RemoteAddr is the address that was given to "swarm join". It is used
	// to find LocalAddr if necessary.
	RemoteAddr string
	// ListenAddr is the address we bind to, including a port.
	ListenAddr string
	// AdvertiseAddr is the address other nodes should connect to,
	// including a port.
	AdvertiseAddr string
	// DataPathAddr is the address that has to be used for the data path
	DataPathAddr string

	joinAddr        string
	forceNewCluster bool
	joinToken       string
	lockKey         []byte
	autolock        bool
	availability    types.NodeAvailability
}

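// Ready returns a channel that is closed once the node has either become
// active or exited. If the node exited before becoming active, the exit
// error (which may be nil) is sent on the channel before it is closed.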
func (n *nodeRunner) Ready() chan error {
	c := make(chan error, 1)
	n.mu.RLock()
	ready, done := n.ready, n.done
	n.mu.RUnlock()
	go func() {
		select {
		case <-ready:
		case <-done:
		}
		select {
		case <-ready:
		default:
			n.mu.RLock()
			c <- n.err
			n.mu.RUnlock()
		}
		close(c)
	}()
	return c
}

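// Start starts a new swarmkit node with the given configuration, resetting
// the reconnect backoff delay to its initial value.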
func (n *nodeRunner) Start(conf nodeStartConfig) error {
	n.mu.Lock()
	defer n.mu.Unlock()
	n.reconnectDelay = initialReconnectDelay
	return n.start(conf)
}

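// start creates and starts the underlying swarmkit node and spawns the
// goroutines that track readiness, control socket changes and node exit.
// Callers must hold n.mu.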
func (n *nodeRunner) start(conf nodeStartConfig) error {
	var control string
	if runtime.GOOS == "windows" {
		control = `\\.\pipe\` + controlSocket
	} else {
		control = filepath.Join(n.cluster.runtimeRoot, controlSocket)
	}
	// Hostname is not set here. Instead, it is obtained from
	// the node description that is reported periodically
	swarmnodeConfig := swarmnode.Config{
		ForceNewCluster:    conf.forceNewCluster,
		ListenControlAPI:   control,
		ListenRemoteAPI:    conf.ListenAddr,
		AdvertiseRemoteAPI: conf.AdvertiseAddr,
		JoinAddr:           conf.joinAddr,
		StateDir:           n.cluster.root,
		JoinToken:          conf.joinToken,
		Executor:           container.NewExecutor(n.cluster.config.Backend),
		HeartbeatTick:      1,
		ElectionTick:       3,
		UnlockKey:          conf.lockKey,
		AutoLockManagers:   conf.autolock,
		PluginGetter:       n.cluster.config.Backend.PluginGetter(),
	}
	if conf.availability != "" {
		avail, ok := swarmapi.NodeSpec_Availability_value[strings.ToUpper(string(conf.availability))]
		if !ok {
			return fmt.Errorf("invalid Availability: %q", conf.availability)
		}
		swarmnodeConfig.Availability = swarmapi.NodeSpec_Availability(avail)
	}
	node, err := swarmnode.New(&swarmnodeConfig)
	if err != nil {
		return err
	}
	if err := node.Start(context.Background()); err != nil {
		return err
	}
	n.done = make(chan struct{})
	n.ready = make(chan struct{})
	n.swarmNode = node
	n.config = conf
	savePersistentState(n.cluster.root, conf)

	ctx, cancel := context.WithCancel(context.Background())
	go func() {
		n.handleNodeExit(node)
		cancel()
	}()
	go n.handleReadyEvent(ctx, node, n.ready)
	go n.handleControlSocketChange(ctx, node)

	return nil
}

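// handleControlSocketChange refreshes the grpc control and logs clients
// whenever the node's control socket connection changes, and notifies the
// cluster of each change.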
func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmnode.Node) {
	for conn := range node.ListenControlSocket(ctx) {
		n.mu.Lock()
		if n.grpcConn != conn {
			if conn == nil {
				n.controlClient = nil
				n.logsClient = nil
			} else {
				n.controlClient = swarmapi.NewControlClient(conn)
				n.logsClient = swarmapi.NewLogsClient(conn)
			}
		}
		n.grpcConn = conn
		n.mu.Unlock()
		n.cluster.configEvent <- struct{}{}
	}
}

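// handleReadyEvent clears any stored error and closes the ready channel
// once the node reports that it is ready, then notifies the cluster of the
// configuration change.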
func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, ready chan struct{}) {
	select {
	case <-node.Ready():
		n.mu.Lock()
		n.err = nil
		n.mu.Unlock()
		close(ready)
	case <-ctx.Done():
	}
	n.cluster.configEvent <- struct{}{}
}

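// handleNodeExit records the node's exit error and closes the done channel.
// A restart with backoff is scheduled unless this was the node's first run
// and it never became ready.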
func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
	err := detectLockedError(node.Err(context.Background()))
	if err != nil {
		logrus.Errorf("cluster exited with error: %v", err)
	}
	n.mu.Lock()
	n.swarmNode = nil
	n.err = err
	close(n.done)
	select {
	case <-n.ready:
		n.enableReconnectWatcher()
	default:
		if n.repeatedRun {
			n.enableReconnectWatcher()
		}
	}
	n.repeatedRun = true
	n.mu.Unlock()
}

// Stop stops the current swarm node if it is running.
func (n *nodeRunner) Stop() error {
	n.mu.Lock()
	if n.cancelReconnect != nil { // between restarts
		n.cancelReconnect()
		n.cancelReconnect = nil
	}
	if n.swarmNode == nil {
		n.mu.Unlock()
		return nil
	}
	n.stopping = true
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	n.mu.Unlock()
	if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
		return err
	}
	<-n.done
	return nil
}

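// State returns a snapshot of the current node state, deriving the local
// node status from the stored error, any pending reconnect and the node's
// readiness.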
func (n *nodeRunner) State() nodeState {
	if n == nil {
		return nodeState{status: types.LocalNodeStateInactive}
	}
	n.mu.RLock()
	defer n.mu.RUnlock()
	ns := n.nodeState
	if ns.err != nil || n.cancelReconnect != nil {
		if errors.Cause(ns.err) == errSwarmLocked {
			ns.status = types.LocalNodeStateLocked
		} else {
			ns.status = types.LocalNodeStateError
		}
	} else {
		select {
		case <-n.ready:
			ns.status = types.LocalNodeStateActive
		default:
			ns.status = types.LocalNodeStatePending
		}
	}
	return ns
}

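// enableReconnectWatcher doubles the reconnect delay (capped at
// maxReconnectDelay) and schedules a restart of the swarm node once that
// delay has elapsed. Callers must hold n.mu.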
func (n *nodeRunner) enableReconnectWatcher() {
	if n.stopping {
		return
	}
	n.reconnectDelay *= 2
	if n.reconnectDelay > maxReconnectDelay {
		n.reconnectDelay = maxReconnectDelay
	}
	logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
	delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
	n.cancelReconnect = cancel

	config := n.config
	go func() {
		<-delayCtx.Done()
		if delayCtx.Err() != context.DeadlineExceeded {
			return
		}
		n.mu.Lock()
		defer n.mu.Unlock()
		if n.stopping {
			return
		}
		config.RemoteAddr = n.cluster.getRemoteAddress()
		config.joinAddr = config.RemoteAddr
		if err := n.start(config); err != nil {
			n.err = err
		}
	}()
}

// nodeState represents information about the current state of the cluster and
// provides access to the grpc clients.
type nodeState struct {
	swarmNode       *swarmnode.Node
	grpcConn        *grpc.ClientConn
	controlClient   swarmapi.ControlClient
	logsClient      swarmapi.LogsClient
	status          types.LocalNodeState
	actualLocalAddr string
	err             error
}

// IsActiveManager returns true if node is a manager ready to accept control requests. It is safe to access the client properties if this returns true.
func (ns nodeState) IsActiveManager() bool {
	return ns.controlClient != nil
}

// IsManager returns true if node is a manager.
func (ns nodeState) IsManager() bool {
	return ns.swarmNode != nil && ns.swarmNode.Manager() != nil
}

// NodeID returns node's ID or empty string if node is inactive.
func (ns nodeState) NodeID() string {
	if ns.swarmNode != nil {
		return ns.swarmNode.NodeID()
	}
	return ""
}