package cluster

import (
	"fmt"
	"net"
	"strings"
	"time"

	"github.com/Sirupsen/logrus"
	apierrors "github.com/docker/docker/api/errors"
	apitypes "github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/filters"
	types "github.com/docker/docker/api/types/swarm"
	"github.com/docker/docker/daemon/cluster/convert"
	"github.com/docker/docker/opts"
	"github.com/docker/docker/pkg/signal"
	swarmapi "github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/manager/encryption"
	swarmnode "github.com/docker/swarmkit/node"
	"github.com/pkg/errors"
	"golang.org/x/net/context"
)
// Init initializes a new cluster from a user-provided request.
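//
// A minimal usage sketch (illustrative only; assumes a configured *Cluster
// named c, using real types.InitRequest fields):
//
//	nodeID, err := c.Init(types.InitRequest{
//		ListenAddr:    "0.0.0.0:2377",
//		AdvertiseAddr: "192.168.1.5:2377",
//	})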
func (c *Cluster) Init(req types.InitRequest) (string, error) {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	if c.nr != nil {
		if req.ForceNewCluster {
			// Take c.mu temporarily to wait for presently running
			// API handlers to finish before shutting down the node.
			c.mu.Lock()
			c.mu.Unlock()

			if err := c.nr.Stop(); err != nil {
				return "", err
			}
		} else {
			return "", errSwarmExists
		}
	}

	if err := validateAndSanitizeInitRequest(&req); err != nil {
		return "", apierrors.NewBadRequestError(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return "", err
	}

	advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
	if err != nil {
		return "", err
	}

	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return "", err
	}

	localAddr := listenHost
	// If the local address is undetermined, the advertise address
	// will be used as the local address, provided it belongs to this system.
	// If the advertise address is not local, we try to find a system
	// address to use as the local address. If this fails, we give up
	// and ask the user to pass the listen address explicitly.
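	// For example (illustrative): with ListenAddr "0.0.0.0:2377" and
	// AdvertiseAddr "192.168.1.5:2377", localAddr becomes "192.168.1.5"
	// when that IP is assigned to one of this host's interfaces.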
	if net.ParseIP(localAddr).IsUnspecified() {
		advertiseIP := net.ParseIP(advertiseHost)

		found := false
		for _, systemIP := range listSystemIPs() {
			if systemIP.Equal(advertiseIP) {
				localAddr = advertiseIP.String()
				found = true
				break
			}
		}

		if !found {
			ip, err := c.resolveSystemAddr()
			if err != nil {
				logrus.Warnf("Could not find a local address: %v", err)
				return "", errMustSpecifyListenAddr
			}
			localAddr = ip.String()
		}
	}

	if !req.ForceNewCluster {
		clearPersistentState(c.root)
	}

	nr, err := c.newNodeRunner(nodeStartConfig{
		forceNewCluster: req.ForceNewCluster,
		autolock:        req.AutoLockManagers,
		LocalAddr:       localAddr,
		ListenAddr:      net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr:   net.JoinHostPort(advertiseHost, advertisePort),
		DataPathAddr:    dataPathAddr,
		availability:    req.Availability,
	})
	if err != nil {
		return "", err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	if err := <-nr.Ready(); err != nil {
		if !req.ForceNewCluster { // if the first attempt fails, don't keep state
			if err := clearPersistentState(c.root); err != nil {
				return "", err
			}
		}
		// err from Ready() is always non-nil here; drop the node runner reference.
		c.mu.Lock()
		c.nr = nil
		c.mu.Unlock()
		return "", err
	}
	state := nr.State()
	if state.swarmNode == nil { // should never happen but protect from panic
		return "", errors.New("invalid cluster state for spec initialization")
	}
	if err := initClusterSpec(state.swarmNode, req.Spec); err != nil {
		return "", err
	}
	return state.NodeID(), nil
}
// Join makes the current Cluster part of an existing swarm cluster.
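//
// A minimal usage sketch (illustrative; assumes a configured *Cluster c and
// a join token previously issued by a manager):
//
//	err := c.Join(types.JoinRequest{
//		RemoteAddrs: []string{"192.168.1.5:2377"},
//		JoinToken:   token, // hypothetical variable holding the token
//		ListenAddr:  "0.0.0.0:2377",
//	})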
func (c *Cluster) Join(req types.JoinRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()
	c.mu.Lock()
	if c.nr != nil {
		c.mu.Unlock()
		return errSwarmExists
	}
	c.mu.Unlock()

	if err := validateAndSanitizeJoinRequest(&req); err != nil {
		return apierrors.NewBadRequestError(err)
	}

	listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
	if err != nil {
		return err
	}

	var advertiseAddr string
	if req.AdvertiseAddr != "" {
		advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
		// For joining, we don't need to provide an advertise address,
		// since the remote side can detect it.
		if err == nil {
			advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
		}
	}

	dataPathAddr, err := resolveDataPathAddr(req.DataPathAddr)
	if err != nil {
		return err
	}

	clearPersistentState(c.root)

	nr, err := c.newNodeRunner(nodeStartConfig{
		RemoteAddr:    req.RemoteAddrs[0],
		ListenAddr:    net.JoinHostPort(listenHost, listenPort),
		AdvertiseAddr: advertiseAddr,
		DataPathAddr:  dataPathAddr,
		joinAddr:      req.RemoteAddrs[0],
		joinToken:     req.JoinToken,
		availability:  req.Availability,
	})
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	select {
	case <-time.After(swarmConnectTimeout):
		return errSwarmJoinTimeoutReached
	case err := <-nr.Ready():
		if err != nil {
			c.mu.Lock()
			c.nr = nil
			c.mu.Unlock()
		}
		return err
	}
}
// Inspect retrieves the configuration properties of a managed swarm cluster.
func (c *Cluster) Inspect() (types.Swarm, error) {
	var swarm *swarmapi.Cluster
	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		s, err := getSwarm(ctx, state.controlClient)
		if err != nil {
			return err
		}
		swarm = s
		return nil
	}); err != nil {
		return types.Swarm{}, err
	}
	return convert.SwarmFromGRPC(*swarm), nil
}
// Update updates the configuration of a managed swarm cluster.
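//
// Updates are versioned: callers typically Inspect() first, modify the
// returned spec, and pass back the version they read. A sketch (illustrative,
// assuming a configured *Cluster c):
//
//	sw, err := c.Inspect()
//	if err == nil {
//		sw.Spec.Raft.SnapshotInterval = 10000
//		err = c.Update(sw.Version.Index, sw.Spec, types.UpdateFlags{})
//	}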
func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
	return c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		swarm, err := getSwarm(ctx, state.controlClient)
		if err != nil {
			return err
		}

		// In update, the client should provide the complete spec of the swarm,
		// including Name and Labels. If a field is specified as 0 or nil, the
		// swarmkit default value will be used.
		clusterSpec, err := convert.SwarmSpecToGRPC(spec)
		if err != nil {
			return apierrors.NewBadRequestError(err)
		}

		_, err = state.controlClient.UpdateCluster(
			ctx,
			&swarmapi.UpdateClusterRequest{
				ClusterID: swarm.ID,
				Spec:      &clusterSpec,
				ClusterVersion: &swarmapi.Version{
					Index: version,
				},
				Rotation: swarmapi.KeyRotation{
					WorkerJoinToken:  flags.RotateWorkerToken,
					ManagerJoinToken: flags.RotateManagerToken,
					ManagerUnlockKey: flags.RotateManagerUnlockKey,
				},
			},
		)
		return err
	})
}
// GetUnlockKey returns the unlock key for the swarm.
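//
// The key is returned in human-readable form, along the lines of (value
// illustrative, not a real key):
//
//	SWMKEY-1-7c37Cc8654o6p38HnroywCi19pllOnGtbdZEgtKxZu8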
func (c *Cluster) GetUnlockKey() (string, error) {
	var resp *swarmapi.GetUnlockKeyResponse
	if err := c.lockedManagerAction(func(ctx context.Context, state nodeState) error {
		client := swarmapi.NewCAClient(state.grpcConn)

		r, err := client.GetUnlockKey(ctx, &swarmapi.GetUnlockKeyRequest{})
		if err != nil {
			return err
		}
		resp = r
		return nil
	}); err != nil {
		return "", err
	}
	if len(resp.UnlockKey) == 0 {
		// no key
		return "", nil
	}
	return encryption.HumanReadableKey(resp.UnlockKey), nil
}
// UnlockSwarm provides a key to decrypt data that is encrypted at rest.
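//
// A usage sketch (illustrative; key is the human-readable string reported by
// GetUnlockKey):
//
//	err := c.UnlockSwarm(types.UnlockRequest{UnlockKey: key})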
func (c *Cluster) UnlockSwarm(req types.UnlockRequest) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.RLock()
	state := c.currentNodeState()

	if !state.IsActiveManager() {
		// When the manager is not active, return an error
		// unless the swarm is locked.
		if err := c.errNoManager(state); err != errSwarmLocked {
			c.mu.RUnlock()
			return err
		}
	} else {
		// When the manager is active, the swarm cannot be locked.
		c.mu.RUnlock()
		return errors.New("swarm is not locked")
	}

	// Execution reaches here only when the swarm is locked.
	nr := c.nr
	c.mu.RUnlock()

	key, err := encryption.ParseHumanReadableKey(req.UnlockKey)
	if err != nil {
		return err
	}

	config := nr.config
	config.lockKey = key
	if err := nr.Stop(); err != nil {
		return err
	}

	nr, err = c.newNodeRunner(config)
	if err != nil {
		return err
	}

	c.mu.Lock()
	c.nr = nr
	c.mu.Unlock()

	if err := <-nr.Ready(); err != nil {
		if errors.Cause(err) == errSwarmLocked {
			return errors.New("swarm could not be unlocked: invalid key provided")
		}
		return fmt.Errorf("swarm component could not be started: %v", err)
	}
	return nil
}
// Leave shuts down the Cluster and removes current state.
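//
// A usage sketch (illustrative): force bypasses the quorum and lock safety
// checks below, mirroring `docker swarm leave --force`:
//
//	err := c.Leave(true)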
func (c *Cluster) Leave(force bool) error {
	c.controlMutex.Lock()
	defer c.controlMutex.Unlock()

	c.mu.Lock()
	nr := c.nr
	if nr == nil {
		c.mu.Unlock()
		return errNoSwarm
	}

	state := c.currentNodeState()
	c.mu.Unlock()

	if errors.Cause(state.err) == errSwarmLocked && !force {
		// Leaving a locked swarm without --force is not allowed.
		return errors.New("Swarm is encrypted and locked. Please unlock it first or use `--force` to ignore this message.")
	}

	if state.IsManager() && !force {
		msg := "You are attempting to leave the swarm on a node that is participating as a manager. "
		if state.IsActiveManager() {
			active, reachable, unreachable, err := managerStats(state.controlClient, state.NodeID())
			if err == nil {
				if active && removingManagerCausesLossOfQuorum(reachable, unreachable) {
					if isLastManager(reachable, unreachable) {
						msg += "Removing the last manager erases all current state of the swarm. Use `--force` to ignore this message. "
						return errors.New(msg)
					}
					msg += fmt.Sprintf("Removing this node leaves %v managers out of %v. Without a Raft quorum your swarm will be inaccessible. ", reachable-1, reachable+unreachable)
				}
			}
		} else {
			msg += "Doing so may lose the consensus of your cluster. "
		}

		msg += "The only way to restore a swarm that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to suppress this message."
		return errors.New(msg)
	}

	// Stop the node; this releases any readers blocked on it.
	if err := nr.Stop(); err != nil {
		logrus.Errorf("failed to shut down cluster node: %v", err)
		signal.DumpStacks("")
		return err
	}

	c.mu.Lock()
	c.nr = nil
	c.mu.Unlock()

	if nodeID := state.NodeID(); nodeID != "" {
		nodeContainers, err := c.listContainerForNode(nodeID)
		if err != nil {
			return err
		}
		for _, id := range nodeContainers {
			if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
				logrus.Errorf("error removing %v: %v", id, err)
			}
		}
	}

	c.configEvent <- struct{}{}
	// todo: cleanup optional?
	if err := clearPersistentState(c.root); err != nil {
		return err
	}
	c.config.Backend.DaemonLeavesCluster()
	return nil
}
// Info returns information about the current cluster state.
func (c *Cluster) Info() types.Info {
	info := types.Info{
		NodeAddr: c.GetAdvertiseAddress(),
	}
	c.mu.RLock()
	defer c.mu.RUnlock()

	state := c.currentNodeState()
	info.LocalNodeState = state.status
	if state.err != nil {
		info.Error = state.err.Error()
	}

	ctx, cancel := c.getRequestContext()
	defer cancel()

	if state.IsActiveManager() {
		info.ControlAvailable = true
		swarm, err := c.Inspect()
		if err != nil {
			info.Error = err.Error()
		}
		info.Cluster = &swarm.ClusterInfo

		if r, err := state.controlClient.ListNodes(ctx, &swarmapi.ListNodesRequest{}); err != nil {
			info.Error = err.Error()
		} else {
			info.Nodes = len(r.Nodes)
			for _, n := range r.Nodes {
				if n.ManagerStatus != nil {
					info.Managers++
				}
			}
		}
	}

	if state.swarmNode != nil {
		for _, r := range state.swarmNode.Remotes() {
			info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
		}
		info.NodeID = state.swarmNode.NodeID()
	}

	return info
}
func validateAndSanitizeInitRequest(req *types.InitRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}

	if req.Spec.Annotations.Name == "" {
		req.Spec.Annotations.Name = "default"
	} else if req.Spec.Annotations.Name != "default" {
		return errors.New(`swarm spec must be named "default"`)
	}

	return nil
}
func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
	var err error
	req.ListenAddr, err = validateAddr(req.ListenAddr)
	if err != nil {
		return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
	}
	if len(req.RemoteAddrs) == 0 {
		return errors.New("at least 1 RemoteAddr is required to join")
	}
	for i := range req.RemoteAddrs {
		req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
		if err != nil {
			return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
		}
	}
	return nil
}
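// validateAddr normalizes addr as a TCP address against the package-level
// defaultAddr, stripping the "tcp://" prefix; addresses that do not parse as
// TCP are passed through for later resolution. For example (illustrative,
// assuming defaultAddr supplies port 2377): "192.168.1.5" becomes
// "192.168.1.5:2377".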
func validateAddr(addr string) (string, error) {
	if addr == "" {
		return addr, errors.New("invalid empty address")
	}
	newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
	if err != nil {
		// Not a plain TCP address (it may, for example, name a network
		// interface); pass it through unchanged and let the later address
		// resolution step deal with it.
		return addr, nil
	}
	return strings.TrimPrefix(newaddr, "tcp://"), nil
}
func initClusterSpec(node *swarmnode.Node, spec types.Spec) error {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	for conn := range node.ListenControlSocket(ctx) {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if conn != nil {
			client := swarmapi.NewControlClient(conn)
			var cluster *swarmapi.Cluster
			for i := 0; ; i++ {
				lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
				if err != nil {
					return fmt.Errorf("error on listing clusters: %v", err)
				}
				if len(lcr.Clusters) == 0 {
					if i < 10 {
						time.Sleep(200 * time.Millisecond)
						continue
					}
					return errors.New("empty list of clusters was returned")
				}
				cluster = lcr.Clusters[0]
				break
			}
			// In init, we take the initial default values from swarmkit, and
			// merge any non-nil or non-zero values from spec into the GRPC
			// spec. This leaves the default values alone.
			// Note that this is different from Update(), where we expect the
			// user to specify the complete spec of the cluster (as they
			// already know the existing one and which fields to update).
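			// For example (illustrative): a zero spec.Raft.SnapshotInterval
			// keeps swarmkit's default, while a non-zero value overrides it.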
			clusterSpec, err := convert.MergeSwarmSpecToGRPC(spec, cluster.Spec)
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
				ClusterID:      cluster.ID,
				ClusterVersion: &cluster.Meta.Version,
				Spec:           &clusterSpec,
			})
			if err != nil {
				return fmt.Errorf("error updating cluster settings: %v", err)
			}
			return nil
		}
	}
	return ctx.Err()
}
func (c *Cluster) listContainerForNode(nodeID string) ([]string, error) {
	var ids []string
	filters := filters.NewArgs()
	filters.Add("label", fmt.Sprintf("com.docker.swarm.node.id=%s", nodeID))
	containers, err := c.config.Backend.Containers(&apitypes.ContainerListOptions{
		Filters: filters,
	})
	if err != nil {
		return []string{}, err
	}
	for _, c := range containers {
		ids = append(ids, c.ID)
	}
	return ids, nil
}