noderunner.go 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  1. package cluster
  2. import (
  3. "fmt"
  4. "path/filepath"
  5. "runtime"
  6. "strings"
  7. "sync"
  8. "time"
  9. types "github.com/docker/docker/api/types/swarm"
  10. "github.com/docker/docker/daemon/cluster/executor/container"
  11. lncluster "github.com/docker/libnetwork/cluster"
  12. swarmapi "github.com/docker/swarmkit/api"
  13. swarmnode "github.com/docker/swarmkit/node"
  14. "github.com/pkg/errors"
  15. "github.com/sirupsen/logrus"
  16. "golang.org/x/net/context"
  17. "google.golang.org/grpc"
  18. )
// nodeRunner implements a manager for continuously running swarmkit node, restarting them with backoff delays if needed.
type nodeRunner struct {
	// nodeState is embedded so the runner's methods can read and update the
	// current node, its grpc clients and the last error directly.
	nodeState

	// mu is the lock the runner's methods take before reading or writing
	// the state kept in this struct.
	mu sync.RWMutex

	done  chan struct{} // closed when swarmNode exits
	ready chan struct{} // closed when swarmNode becomes active

	// reconnectDelay is the wait before the next automatic restart. Start
	// resets it to initialReconnectDelay; enableReconnectWatcher doubles it
	// and caps it at maxReconnectDelay.
	reconnectDelay time.Duration

	// config is the configuration used by the most recent successful start.
	// It is reused when the node is restarted automatically.
	config nodeStartConfig

	// repeatedRun is set to true after the first node exit. Once set, a
	// node that exits before becoming ready is still scheduled for restart.
	repeatedRun bool

	// cancelReconnect cancels the pending delayed restart. It is set by
	// enableReconnectWatcher and cleared by Stop.
	cancelReconnect func()

	// stopping is set by Stop; while it is true no restart is scheduled
	// or carried out.
	stopping bool

	cluster *Cluster // only for accessing config helpers, never call any methods. TODO: change to config struct
}
// nodeStartConfig holds configuration needed to start a new node. Exported
// fields of this structure are saved to disk in json. Unexported fields
// contain data that shouldn't be persisted between daemon reloads.
type nodeStartConfig struct {
	// LocalAddr is this machine's local IP or hostname, if specified.
	LocalAddr string
	// RemoteAddr is the address that was given to "swarm join". It is used
	// to find LocalAddr if necessary.
	RemoteAddr string
	// ListenAddr is the address we bind to, including a port.
	ListenAddr string
	// AdvertiseAddr is the address other nodes should connect to,
	// including a port.
	AdvertiseAddr string
	// DataPathAddr is the address that has to be used for the data path
	DataPathAddr string
	// JoinInProgress is set to true if a join operation has started, but
	// not completed yet.
	JoinInProgress bool

	// joinAddr is the address to join. When it is empty but JoinInProgress
	// is set, start falls back to RemoteAddr.
	joinAddr string
	// forceNewCluster is passed to swarmkit as Config.ForceNewCluster.
	forceNewCluster bool
	// joinToken is passed to swarmkit as Config.JoinToken.
	joinToken string
	// lockKey is passed to swarmkit as Config.UnlockKey.
	lockKey []byte
	// autolock is passed to swarmkit as Config.AutoLockManagers.
	autolock bool
	// availability, when non-empty, is upper-cased and looked up in
	// swarmapi.NodeSpec_Availability_value; start fails on an unknown value.
	availability types.NodeAvailability
}
  58. func (n *nodeRunner) Ready() chan error {
  59. c := make(chan error, 1)
  60. n.mu.RLock()
  61. ready, done := n.ready, n.done
  62. n.mu.RUnlock()
  63. go func() {
  64. select {
  65. case <-ready:
  66. case <-done:
  67. }
  68. select {
  69. case <-ready:
  70. default:
  71. n.mu.RLock()
  72. c <- n.err
  73. n.mu.RUnlock()
  74. }
  75. close(c)
  76. }()
  77. return c
  78. }
  79. func (n *nodeRunner) Start(conf nodeStartConfig) error {
  80. n.mu.Lock()
  81. defer n.mu.Unlock()
  82. n.reconnectDelay = initialReconnectDelay
  83. return n.start(conf)
  84. }
  85. func (n *nodeRunner) start(conf nodeStartConfig) error {
  86. var control string
  87. if runtime.GOOS == "windows" {
  88. control = `\\.\pipe\` + controlSocket
  89. } else {
  90. control = filepath.Join(n.cluster.runtimeRoot, controlSocket)
  91. }
  92. joinAddr := conf.joinAddr
  93. if joinAddr == "" && conf.JoinInProgress {
  94. // We must have been restarted while trying to join a cluster.
  95. // Continue trying to join instead of forming our own cluster.
  96. joinAddr = conf.RemoteAddr
  97. }
  98. // Hostname is not set here. Instead, it is obtained from
  99. // the node description that is reported periodically
  100. swarmnodeConfig := swarmnode.Config{
  101. ForceNewCluster: conf.forceNewCluster,
  102. ListenControlAPI: control,
  103. ListenRemoteAPI: conf.ListenAddr,
  104. AdvertiseRemoteAPI: conf.AdvertiseAddr,
  105. JoinAddr: joinAddr,
  106. StateDir: n.cluster.root,
  107. JoinToken: conf.joinToken,
  108. Executor: container.NewExecutor(n.cluster.config.Backend, n.cluster.config.PluginBackend),
  109. HeartbeatTick: 1,
  110. ElectionTick: 3,
  111. UnlockKey: conf.lockKey,
  112. AutoLockManagers: conf.autolock,
  113. PluginGetter: n.cluster.config.Backend.PluginGetter(),
  114. }
  115. if conf.availability != "" {
  116. avail, ok := swarmapi.NodeSpec_Availability_value[strings.ToUpper(string(conf.availability))]
  117. if !ok {
  118. return fmt.Errorf("invalid Availability: %q", conf.availability)
  119. }
  120. swarmnodeConfig.Availability = swarmapi.NodeSpec_Availability(avail)
  121. }
  122. node, err := swarmnode.New(&swarmnodeConfig)
  123. if err != nil {
  124. return err
  125. }
  126. if err := node.Start(context.Background()); err != nil {
  127. return err
  128. }
  129. n.done = make(chan struct{})
  130. n.ready = make(chan struct{})
  131. n.swarmNode = node
  132. if conf.joinAddr != "" {
  133. conf.JoinInProgress = true
  134. }
  135. n.config = conf
  136. savePersistentState(n.cluster.root, conf)
  137. ctx, cancel := context.WithCancel(context.Background())
  138. go func() {
  139. n.handleNodeExit(node)
  140. cancel()
  141. }()
  142. go n.handleReadyEvent(ctx, node, n.ready)
  143. go n.handleControlSocketChange(ctx, node)
  144. return nil
  145. }
  146. func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmnode.Node) {
  147. for conn := range node.ListenControlSocket(ctx) {
  148. n.mu.Lock()
  149. if n.grpcConn != conn {
  150. if conn == nil {
  151. n.controlClient = nil
  152. n.logsClient = nil
  153. } else {
  154. n.controlClient = swarmapi.NewControlClient(conn)
  155. n.logsClient = swarmapi.NewLogsClient(conn)
  156. // push store changes to daemon
  157. go n.watchClusterEvents(ctx, conn)
  158. }
  159. }
  160. n.grpcConn = conn
  161. n.mu.Unlock()
  162. n.cluster.SendClusterEvent(lncluster.EventSocketChange)
  163. }
  164. }
  165. func (n *nodeRunner) watchClusterEvents(ctx context.Context, conn *grpc.ClientConn) {
  166. client := swarmapi.NewWatchClient(conn)
  167. watch, err := client.Watch(ctx, &swarmapi.WatchRequest{
  168. Entries: []*swarmapi.WatchRequest_WatchEntry{
  169. {
  170. Kind: "node",
  171. Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
  172. },
  173. {
  174. Kind: "service",
  175. Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
  176. },
  177. {
  178. Kind: "network",
  179. Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
  180. },
  181. {
  182. Kind: "secret",
  183. Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
  184. },
  185. {
  186. Kind: "config",
  187. Action: swarmapi.WatchActionKindCreate | swarmapi.WatchActionKindUpdate | swarmapi.WatchActionKindRemove,
  188. },
  189. },
  190. IncludeOldObject: true,
  191. })
  192. if err != nil {
  193. logrus.WithError(err).Error("failed to watch cluster store")
  194. return
  195. }
  196. for {
  197. msg, err := watch.Recv()
  198. if err != nil {
  199. // store watch is broken
  200. logrus.WithError(err).Error("failed to receive changes from store watch API")
  201. return
  202. }
  203. select {
  204. case <-ctx.Done():
  205. return
  206. case n.cluster.watchStream <- msg:
  207. }
  208. }
  209. }
  210. func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, ready chan struct{}) {
  211. select {
  212. case <-node.Ready():
  213. n.mu.Lock()
  214. n.err = nil
  215. if n.config.JoinInProgress {
  216. n.config.JoinInProgress = false
  217. savePersistentState(n.cluster.root, n.config)
  218. }
  219. n.mu.Unlock()
  220. close(ready)
  221. case <-ctx.Done():
  222. }
  223. n.cluster.SendClusterEvent(lncluster.EventNodeReady)
  224. }
  225. func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) {
  226. err := detectLockedError(node.Err(context.Background()))
  227. if err != nil {
  228. logrus.Errorf("cluster exited with error: %v", err)
  229. }
  230. n.mu.Lock()
  231. n.swarmNode = nil
  232. n.err = err
  233. close(n.done)
  234. select {
  235. case <-n.ready:
  236. n.enableReconnectWatcher()
  237. default:
  238. if n.repeatedRun {
  239. n.enableReconnectWatcher()
  240. }
  241. }
  242. n.repeatedRun = true
  243. n.mu.Unlock()
  244. }
  245. // Stop stops the current swarm node if it is running.
  246. func (n *nodeRunner) Stop() error {
  247. n.mu.Lock()
  248. if n.cancelReconnect != nil { // between restarts
  249. n.cancelReconnect()
  250. n.cancelReconnect = nil
  251. }
  252. if n.swarmNode == nil {
  253. n.mu.Unlock()
  254. return nil
  255. }
  256. n.stopping = true
  257. ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
  258. defer cancel()
  259. n.mu.Unlock()
  260. if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
  261. return err
  262. }
  263. n.cluster.SendClusterEvent(lncluster.EventNodeLeave)
  264. <-n.done
  265. return nil
  266. }
  267. func (n *nodeRunner) State() nodeState {
  268. if n == nil {
  269. return nodeState{status: types.LocalNodeStateInactive}
  270. }
  271. n.mu.RLock()
  272. defer n.mu.RUnlock()
  273. ns := n.nodeState
  274. if ns.err != nil || n.cancelReconnect != nil {
  275. if errors.Cause(ns.err) == errSwarmLocked {
  276. ns.status = types.LocalNodeStateLocked
  277. } else {
  278. ns.status = types.LocalNodeStateError
  279. }
  280. } else {
  281. select {
  282. case <-n.ready:
  283. ns.status = types.LocalNodeStateActive
  284. default:
  285. ns.status = types.LocalNodeStatePending
  286. }
  287. }
  288. return ns
  289. }
  290. func (n *nodeRunner) enableReconnectWatcher() {
  291. if n.stopping {
  292. return
  293. }
  294. n.reconnectDelay *= 2
  295. if n.reconnectDelay > maxReconnectDelay {
  296. n.reconnectDelay = maxReconnectDelay
  297. }
  298. logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
  299. delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
  300. n.cancelReconnect = cancel
  301. go func() {
  302. <-delayCtx.Done()
  303. if delayCtx.Err() != context.DeadlineExceeded {
  304. return
  305. }
  306. n.mu.Lock()
  307. defer n.mu.Unlock()
  308. if n.stopping {
  309. return
  310. }
  311. if err := n.start(n.config); err != nil {
  312. n.err = err
  313. }
  314. }()
  315. }
// nodeState represents information about the current state of the cluster and
// provides access to the grpc clients.
type nodeState struct {
	// swarmNode is the swarmkit node set by a successful start; it is reset
	// to nil when the node exits.
	swarmNode *swarmnode.Node
	// grpcConn is the most recent connection delivered by the node's
	// control socket listener (may be nil).
	grpcConn *grpc.ClientConn
	// controlClient is created from grpcConn and reset to nil when a nil
	// connection is delivered.
	controlClient swarmapi.ControlClient
	// logsClient is created and reset together with controlClient.
	logsClient swarmapi.LogsClient
	// status is filled in by nodeRunner.State on the copy it returns.
	status types.LocalNodeState
	// NOTE(review): actualLocalAddr is not written anywhere in the visible
	// part of this file; presumably set by the caller - confirm.
	actualLocalAddr string
	// err holds the error of the last node exit or failed automatic
	// restart; it is cleared when the node becomes ready.
	err error
}
  327. // IsActiveManager returns true if node is a manager ready to accept control requests. It is safe to access the client properties if this returns true.
  328. func (ns nodeState) IsActiveManager() bool {
  329. return ns.controlClient != nil
  330. }
  331. // IsManager returns true if node is a manager.
  332. func (ns nodeState) IsManager() bool {
  333. return ns.swarmNode != nil && ns.swarmNode.Manager() != nil
  334. }
  335. // NodeID returns node's ID or empty string if node is inactive.
  336. func (ns nodeState) NodeID() string {
  337. if ns.swarmNode != nil {
  338. return ns.swarmNode.NodeID()
  339. }
  340. return ""
  341. }