swarm.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548
  1. package cluster
  2. import (
  3. "fmt"
  4. "net"
  5. "strings"
  6. "time"
  7. apitypes "github.com/docker/docker/api/types"
  8. "github.com/docker/docker/api/types/filters"
  9. types "github.com/docker/docker/api/types/swarm"
  10. "github.com/docker/docker/daemon/cluster/convert"
  11. "github.com/docker/docker/opts"
  12. "github.com/docker/docker/pkg/signal"
  13. swarmapi "github.com/docker/swarmkit/api"
  14. "github.com/docker/swarmkit/manager/encryption"
  15. swarmnode "github.com/docker/swarmkit/node"
  16. "github.com/pkg/errors"
  17. "github.com/sirupsen/logrus"
  18. "golang.org/x/net/context"
  19. )
// Init initializes new cluster from user provided request.
//
// It resolves the listen, advertise, and data-path addresses, starts a
// fresh node runner, waits for the node to become ready, and then applies
// the initial swarm spec. On success it returns the new node's ID.
func (c *Cluster) Init(req types.InitRequest) (string, error) {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	if c.nr != nil {
		// A node runner already exists: refuse unless the caller
		// explicitly forces a new cluster, in which case tear it down.
		if req.ForceNewCluster {
			// Take c.mu temporarily to wait for presently running
			// API handlers to finish before shutting down the node.
			c.mu.Lock()
			c.mu.Unlock()
			if err := c.nr.Stop(); err != nil {
				return "", err
			}
		} else {
			return "", errSwarmExists
		}
	}
	if err := validateAndSanitizeInitRequest(&req); err != nil {
		return "", validationError{err}
	}
	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return "", err
	}
	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
	if err != nil {
		return "", err
	}
	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return "", err
	}
	localAddr := listenHost
	// If the local address is undetermined, the advertise address
	// will be used as local address, if it belongs to this system.
	// If the advertise address is not local, then we try to find
	// a system address to use as local address. If this fails,
	// we give up and ask the user to pass the listen address.
	if net.ParseIP(localAddr).IsUnspecified() {
		advertiseIP := net.ParseIP(advertiseHost)
		found := false
		for _, systemIP := range listSystemIPs() {
			if systemIP.Equal(advertiseIP) {
				localAddr = advertiseIP.String()
				found = true
				break
			}
		}
		if !found {
			ip, err := c.resolveSystemAddr()
			if err != nil {
				logrus.Warnf("Could not find a local address: %v", err)
				return "", errMustSpecifyListenAddr
			}
			localAddr = ip.String()
		}
	}
	nr, err := c.newNodeRunner(nodeStartConfig{
		forceNewCluster: req.ForceNewCluster,
		autolock:        req.AutoLockManagers,
		LocalAddr:       localAddr,
		ListenAddr:      net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr:   net.JoinHostPort(advertiseHost, advertisePort),
		DataPathAddr:    dataPathAddr,
		availability:    req.Availability,
	})
	if err != nil {
		return "", err
	}
	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()
	// Block until the node either becomes ready or fails to start.
	if err := <-nr.Ready(); err != nil {
		c.mu.Lock()
		c.nr = nil
		c.mu.Unlock()
		if !req.ForceNewCluster { // if failure on first attempt don't keep state
			if err := clearPersistentState(c.root); err != nil {
				return "", err
			}
		}
		return "", err
	}
	state := nr.State()
	if state.swarmNode == nil { // should never happen but protect from panic
		return "", errors.New("invalid cluster state for spec initialization")
	}
	if err := initClusterSpec(state.swarmNode, req.Spec); err != nil {
		return "", err
	}
	return state.NodeID(), nil
}
// Join makes current Cluster part of an existing swarm cluster.
//
// It resolves the local addresses, starts a node runner pointed at the
// first remote manager address, and waits (bounded by swarmConnectTimeout)
// for the node to become ready.
func (c *Cluster) Join(req types.JoinRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	c.mu.Lock()
	if c.nr != nil {
		c.mu.Unlock()
		return errors.WithStack(errSwarmExists)
	}
	c.mu.Unlock()
	if err := validateAndSanitizeJoinRequest(&req); err != nil {
		return validationError{err}
	}
	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return err
	}
	var advertiseAddr string
	if req.AdvertiseAddr != "" {
		advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
		// For joining, we don't need to provide an advertise address,
		// since the remote side can detect it.
		if err == nil {
			advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
		}
	}
	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return err
	}
	// validateAndSanitizeJoinRequest guarantees RemoteAddrs is non-empty.
	nr, err := c.newNodeRunner(nodeStartConfig{
		RemoteAddr:    req.RemoteAddrs[0],
		ListenAddr:    net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr: advertiseAddr,
		DataPathAddr:  dataPathAddr,
		joinAddr:      req.RemoteAddrs[0],
		joinToken:     req.JoinToken,
		availability:  req.Availability,
	})
	if err != nil {
		return err
	}
	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()
	// Wait for readiness, but give up after swarmConnectTimeout.
	select {
	case <-time.After(swarmConnectTimeout):
		return errSwarmJoinTimeoutReached
	case err := <-nr.Ready():
		if err != nil {
			c.mu.Lock()
			c.nr = nil
			c.mu.Unlock()
			// A failed join leaves no useful state behind.
			if err := clearPersistentState(c.root); err != nil {
				return err
			}
		}
		return err
	}
}
  172. // Inspect retrieves the configuration properties of a managed swarm cluster.
  173. func (c *Cluster) Inspect() (types.Swarm, error) {
  174. var swarm *swarmapi.Cluster
  175. if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
  176. s, err := getSwarm(ctx, state.controlClient)
  177. if err != nil {
  178. return err
  179. }
  180. swarm = s
  181. return nil
  182. }); err != nil {
  183. return types.Swarm{}, err
  184. }
  185. return convert.SwarmFromGRPC(*swarm), nil
  186. }
  187. // Update updates configuration of a managed swarm cluster.
  188. func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
  189. return c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
  190. swarm, err := getSwarm(ctx, state.controlClient)
  191. if err != nil {
  192. return err
  193. }
  194. // In update, client should provide the complete spec of the swarm, including
  195. // Name and Labels. If a field is specified with 0 or nil, then the default value
  196. // will be used to swarmkit.
  197. clusterSpec, err := convert.SwarmSpecToGRPC(spec)
  198. if err != nil {
  199. return convertError{err}
  200. }
  201. _, err = state.controlClient.UpdateCluster(
  202. ctx,
  203. &swarmapi.UpdateClusterRequest{
  204. ClusterID: swarm.ID,
  205. Spec: &clusterSpec,
  206. ClusterVersion: &swarmapi.Version{
  207. Index: version,
  208. },
  209. Rotation: swarmapi.KeyRotation{
  210. WorkerJoinToken: flags.RotateWorkerToken,
  211. ManagerJoinToken: flags.RotateManagerToken,
  212. ManagerUnlockKey: flags.RotateManagerUnlockKey,
  213. },
  214. },
  215. )
  216. return err
  217. })
  218. }
  219. // GetUnlockKey returns the unlock key for the swarm.
  220. func (c *Cluster) GetUnlockKey() (string, error) {
  221. var resp *swarmapi.GetUnlockKeyResponse
  222. if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
  223. client := swarmapi.NewCAClient(state.grpcConn)
  224. r, err := client.GetUnlockKey(ctx, &swarmapi.GetUnlockKeyRequest{})
  225. if err != nil {
  226. return err
  227. }
  228. resp = r
  229. return nil
  230. }); err != nil {
  231. return "", err
  232. }
  233. if len(resp.UnlockKey) == 0 {
  234. // no key
  235. return "", nil
  236. }
  237. return encryption.HumanReadableKey(resp.UnlockKey), nil
  238. }
// UnlockSwarm provides a key to decrypt data that is encrypted at rest.
//
// The swarm must currently be in the locked state; the node runner is
// restarted with the supplied key so it can decrypt its state on startup.
func (c *Cluster) UnlockSwarm(req types.UnlockRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	c.mu.RLock()
	state := c.currentNodeState()
	if !state.IsActiveManager() {
		// when manager is not active,
		// unless it is locked, otherwise return error.
		if err := c.errNoManager(state); err != errSwarmLocked {
			c.mu.RUnlock()
			return err
		}
	} else {
		// when manager is active, return an error of "not locked"
		c.mu.RUnlock()
		return notLockedError{}
	}
	// only when swarm is locked, code running reaches here
	nr := c.nr
	c.mu.RUnlock()
	key, err := encryption.ParseHumanReadableKey(req.UnlockKey)
	if err != nil {
		return validationError{err}
	}
	// Stop the current (locked) runner and start a fresh one configured
	// with the parsed key.
	config := nr.config
	config.lockKey = key
	if err := nr.Stop(); err != nil {
		return err
	}
	nr, err = c.newNodeRunner(config)
	if err != nil {
		return err
	}
	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()
	if err := <-nr.Ready(); err != nil {
		// Still locked after restarting with the new key means the
		// supplied key was wrong.
		if errors.Cause(err) == errSwarmLocked {
			return invalidUnlockKey{}
		}
		return errors.Errorf("swarm component could not be started: %v", err)
	}
	return nil
}
// Leave shuts down Cluster and removes current state.
//
// Without force, leaving is refused when the swarm is locked, or when this
// node is a manager whose departure could break (or destroy) the cluster.
func (c *Cluster) Leave(force bool) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	c.mu.Lock()
	nr := c.nr
	if nr == nil {
		c.mu.Unlock()
		return errors.WithStack(errNoSwarm)
	}
	state := c.currentNodeState()
	c.mu.Unlock()
	if errors.Cause(state.err) == errSwarmLocked && !force {
		// leave a locked swarm without --force is not allowed
		return errors.WithStack(notAvailableError("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message."))
	}
	if state.IsManager() && !force {
		// Build a warning describing the consequences of this manager
		// leaving; --force overrides all of these checks.
		msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
		if state.IsActiveManager() {
			active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
			if err == nil {
				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
					if isLastManager(reachable, unreachable) {
						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
						return errors.WithStack(notAvailableError(msg))
					}
					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
				}
			}
		} else {
			msg += "Doing so may lose the consensus of your cluster. "
		}
		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
		return errors.WithStack(notAvailableError(msg))
	}
	// release readers in here
	if err := nr.Stop(); err != nil {
		logrus.Errorf("failed to shut down cluster node: %v", err)
		signal.DumpStacks("")
		return err
	}
	c.mu.Lock()
	c.nr = nil
	c.mu.Unlock()
	// Force-remove all containers labeled as belonging to this node;
	// individual removal failures are logged but do not abort the leave.
	if nodeID := state.NodeID(); nodeID != "" {
		nodeContainers, err := c.listContainerForNode(nodeID)
		if err != nil {
			return err
		}
		for _, id := range nodeContainers {
			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
				logrus.Errorf("error removing %v: %v", id, err)
			}
		}
	}
	// todo: cleanup optional?
	if err := clearPersistentState(c.root); err != nil {
		return err
	}
	c.config.Backend.DaemonLeavesCluster()
	return nil
}
// Info returns information about the current cluster state.
func (c *Cluster) Info() types.Info {
	info := types.Info{
		NodeAddr: c.GetAdvertiseAddress(),
	}
	c.mu.RLock()
	defer c.mu.RUnlock()
	state := c.currentNodeState()
	info.LocalNodeState = state.status
	if state.err != nil {
		info.Error = state.err.Error()
	}
	ctx, cancel := c.getRequestContext()
	defer cancel()
	if state.IsActiveManager() {
		info.ControlAvailable = true
		swarm, err := c.Inspect()
		if err != nil {
			info.Error = err.Error()
		}
		// NOTE(review): when Inspect fails, swarm is the zero value yet
		// its ClusterInfo is still attached below — confirm this is the
		// intended best-effort behavior.
		info.Cluster = &swarm.ClusterInfo
		if r, err := state.controlClient.ListNodes(ctx, &swarmapi.ListNodesRequest{}); err != nil {
			info.Error = err.Error()
		} else {
			info.Nodes = len(r.Nodes)
			// Nodes reporting a manager status are counted as managers.
			for _, n := range r.Nodes {
				if n.ManagerStatus != nil {
					info.Managers = info.Managers + 1
				}
			}
		}
	}
	if state.swarmNode != nil {
		for _, r := range state.swarmNode.Remotes() {
			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
		}
		info.NodeID = state.swarmNode.NodeID()
	}
	return info
}
  386. func validateAndSanitizeInitRequest(req *types.InitRequest) error {
  387. var err error
  388. req.ListenAddr, err = validateAddr(req.ListenAddr)
  389. if err != nil {
  390. return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
  391. }
  392. if req.Spec.Annotations.Name == "" {
  393. req.Spec.Annotations.Name = "default"
  394. } else if req.Spec.Annotations.Name != "default" {
  395. return errors.New(`swarm spec must be named "default"`)
  396. }
  397. return nil
  398. }
  399. func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
  400. var err error
  401. req.ListenAddr, err = validateAddr(req.ListenAddr)
  402. if err != nil {
  403. return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
  404. }
  405. if len(req.RemoteAddrs) == 0 {
  406. return errors.New("at least 1 RemoteAddr is required to join")
  407. }
  408. for i := range req.RemoteAddrs {
  409. req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
  410. if err != nil {
  411. return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
  412. }
  413. }
  414. return nil
  415. }
// validateAddr rejects an empty address and, when addr parses as a TCP
// address, normalizes it by stripping the "tcp://" scheme prefix.
func validateAddr(addr string) (string, error) {
	if addr == "" {
		return addr, errors.New("invalid empty address")
	}
	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
	if err != nil {
		// NOTE(review): the parse error is deliberately swallowed and the
		// input returned unchanged — presumably so non host:port forms
		// (e.g. interface names) pass through for later resolution.
		// Confirm intended; `return addr, nil` on error looks suspicious.
		return addr, nil
	}
	return strings.TrimPrefix(newaddr, "tcp://"), nil
}
  426. func initClusterSpec(node *swarmnode.Node, spec types.Spec) error {
  427. ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
  428. for conn := range node.ListenControlSocket(ctx) {
  429. if ctx.Err() != nil {
  430. return ctx.Err()
  431. }
  432. if conn != nil {
  433. client := swarmapi.NewControlClient(conn)
  434. var cluster *swarmapi.Cluster
  435. for i := 0; ; i++ {
  436. lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
  437. if err != nil {
  438. return fmt.Errorf("error on listing clusters: %v", err)
  439. }
  440. if len(lcr.Clusters) == 0 {
  441. if i < 10 {
  442. time.Sleep(200 * time.Millisecond)
  443. continue
  444. }
  445. return errors.New("empty list of clusters was returned")
  446. }
  447. cluster = lcr.Clusters[0]
  448. break
  449. }
  450. // In init, we take the initial default values from swarmkit, and merge
  451. // any non nil or 0 value from spec to GRPC spec. This will leave the
  452. // default value alone.
  453. // Note that this is different from Update(), as in Update() we expect
  454. // user to specify the complete spec of the cluster (as they already know
  455. // the existing one and knows which field to update)
  456. clusterSpec, err := convert.MergeSwarmSpecToGRPC(spec, cluster.Spec)
  457. if err != nil {
  458. return fmt.Errorf("error updating cluster settings: %v", err)
  459. }
  460. _, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
  461. ClusterID: cluster.ID,
  462. ClusterVersion: &cluster.Meta.Version,
  463. Spec: &clusterSpec,
  464. })
  465. if err != nil {
  466. return fmt.Errorf("error updating cluster settings: %v", err)
  467. }
  468. return nil
  469. }
  470. }
  471. return ctx.Err()
  472. }
  473. func (c *Cluster) listContainerForNode(nodeID string) ([]string, error) {
  474. var ids []string
  475. filters := filters.NewArgs()
  476. filters.Add("label", fmt.Sprintf("com.docker.swarm.node.id=%s", nodeID))
  477. containers, err := c.config.Backend.Containers(&apitypes.ContainerListOptions{
  478. Filters: filters,
  479. })
  480. if err != nil {
  481. return []string{}, err
  482. }
  483. for _, c := range containers {
  484. ids = append(ids, c.ID)
  485. }
  486. return ids, nil
  487. }