12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407 |
- package cluster
- import (
- "encoding/json"
- "fmt"
- "io/ioutil"
- "net"
- "os"
- "path/filepath"
- "strings"
- "sync"
- "time"
- "google.golang.org/grpc"
- "github.com/Sirupsen/logrus"
- "github.com/docker/docker/daemon/cluster/convert"
- executorpkg "github.com/docker/docker/daemon/cluster/executor"
- "github.com/docker/docker/daemon/cluster/executor/container"
- "github.com/docker/docker/errors"
- "github.com/docker/docker/opts"
- "github.com/docker/docker/pkg/ioutils"
- "github.com/docker/docker/runconfig"
- apitypes "github.com/docker/engine-api/types"
- "github.com/docker/engine-api/types/filters"
- types "github.com/docker/engine-api/types/swarm"
- swarmagent "github.com/docker/swarmkit/agent"
- swarmapi "github.com/docker/swarmkit/api"
- "golang.org/x/net/context"
- )
// Names, addresses and timing values used by the cluster component.
const (
	// swarmDirName is the directory below Config.Root that holds swarm state.
	swarmDirName = "swarm"
	// controlSocket is the file name of the control API socket inside the
	// swarm directory.
	controlSocket = "control.sock"
	// swarmConnectTimeout is how long New and Join wait for a freshly
	// started node to become ready.
	swarmConnectTimeout = 20 * time.Second
	// swarmRequestTimeout is the deadline applied to control API requests.
	swarmRequestTimeout = 20 * time.Second
	// stateFile is the name of the JSON file the address state is saved to.
	stateFile = "docker-state.json"
	// defaultAddr is presumably the fallback listen address; it is not
	// referenced in this part of the file.
	defaultAddr = "0.0.0.0:2377"

	// initialReconnectDelay is the reconnect delay a new node starts with.
	initialReconnectDelay = 100 * time.Millisecond
	// maxReconnectDelay caps the doubling reconnect delay.
	maxReconnectDelay = 30 * time.Second
)
// Sentinel errors returned by the swarm lifecycle operations.
var (
	// ErrNoSwarm is returned when leaving a cluster that was never
	// initialized.
	ErrNoSwarm = fmt.Errorf("This node is not part of swarm")

	// ErrSwarmExists is returned by an init or join request when a cluster
	// has already been activated on this node.
	ErrSwarmExists = fmt.Errorf("This node is already part of a swarm cluster. Use \"docker swarm leave\" to leave this cluster and join another one.")

	// ErrPendingSwarmExists is returned by an init or join request while a
	// similar earlier request is still being processed.
	ErrPendingSwarmExists = fmt.Errorf("This node is processing an existing join request that has not succeeded yet. Use \"docker swarm leave\" to cancel the current request.")

	// ErrSwarmJoinTimeoutReached is returned when joining a cluster did not
	// complete before the timeout was reached.
	ErrSwarmJoinTimeoutReached = fmt.Errorf("Timeout was reached before node was joined. Attempt to join the cluster will continue in the background. Use \"docker info\" command to see the current swarm status of your node.")
)
- // defaultSpec contains some sane defaults if cluster options are missing on init
- var defaultSpec = types.Spec{
- Raft: types.RaftConfig{
- SnapshotInterval: 10000,
- KeepOldSnapshots: 0,
- LogEntriesForSlowFollowers: 500,
- HeartbeatTick: 1,
- ElectionTick: 3,
- },
- CAConfig: types.CAConfig{
- NodeCertExpiry: 90 * 24 * time.Hour,
- },
- Dispatcher: types.DispatcherConfig{
- HeartbeatPeriod: uint64((5 * time.Second).Nanoseconds()),
- },
- Orchestration: types.OrchestrationConfig{
- TaskHistoryRetentionLimit: 10,
- },
- }
// state is the set of addresses written to and read back from the state
// file as JSON.
type state struct {
	// LocalAddr is the local IP or hostname of this machine, when one was
	// specified.
	LocalAddr string
	// RemoteAddr is the address that was passed to "swarm join". It is
	// used to work out LocalAddr when that is necessary.
	RemoteAddr string
	// ListenAddr is the address, port included, that the node binds to.
	ListenAddr string
	// AdvertiseAddr is the address, port included, that other nodes
	// should connect to.
	AdvertiseAddr string
}
// NetworkSubnetsProvider gives access to the subnets of the networks that
// Docker manages, so that those subnets can be filtered out.
type NetworkSubnetsProvider interface {
	// V4Subnets returns the IPv4 subnets.
	V4Subnets() []net.IPNet
	// V6Subnets returns the IPv6 subnets.
	V6Subnets() []net.IPNet
}
- // Config provides values for Cluster.
- type Config struct {
- Root string
- Name string
- Backend executorpkg.Backend
- NetworkSubnetsProvider NetworkSubnetsProvider
- // DefaultAdvertiseAddr is the default host/IP or network interface to use
- // if no AdvertiseAddr value is specified.
- DefaultAdvertiseAddr string
- }
- // Cluster provides capabilities to participate in a cluster as a worker or a
- // manager.
- type Cluster struct {
- sync.RWMutex
- *node
- root string
- config Config
- configEvent chan struct{} // todo: make this array and goroutine safe
- localAddr string
- actualLocalAddr string // after resolution, not persisted
- remoteAddr string
- listenAddr string
- advertiseAddr string
- stop bool
- err error
- cancelDelay func()
- }
- type node struct {
- *swarmagent.Node
- done chan struct{}
- ready bool
- conn *grpc.ClientConn
- client swarmapi.ControlClient
- reconnectDelay time.Duration
- }
- // New creates a new Cluster instance using provided config.
- func New(config Config) (*Cluster, error) {
- root := filepath.Join(config.Root, swarmDirName)
- if err := os.MkdirAll(root, 0700); err != nil {
- return nil, err
- }
- c := &Cluster{
- root: root,
- config: config,
- configEvent: make(chan struct{}, 10),
- }
- st, err := c.loadState()
- if err != nil {
- if os.IsNotExist(err) {
- return c, nil
- }
- return nil, err
- }
- n, err := c.startNewNode(false, st.LocalAddr, st.RemoteAddr, st.ListenAddr, st.AdvertiseAddr, "", "")
- if err != nil {
- return nil, err
- }
- select {
- case <-time.After(swarmConnectTimeout):
- logrus.Errorf("swarm component could not be started before timeout was reached")
- case <-n.Ready():
- case <-n.done:
- return nil, fmt.Errorf("swarm component could not be started: %v", c.err)
- }
- go c.reconnectOnFailure(n)
- return c, nil
- }
- func (c *Cluster) loadState() (*state, error) {
- dt, err := ioutil.ReadFile(filepath.Join(c.root, stateFile))
- if err != nil {
- return nil, err
- }
- // missing certificate means no actual state to restore from
- if _, err := os.Stat(filepath.Join(c.root, "certificates/swarm-node.crt")); err != nil {
- if os.IsNotExist(err) {
- c.clearState()
- }
- return nil, err
- }
- var st state
- if err := json.Unmarshal(dt, &st); err != nil {
- return nil, err
- }
- return &st, nil
- }
- func (c *Cluster) saveState() error {
- dt, err := json.Marshal(state{
- LocalAddr: c.localAddr,
- RemoteAddr: c.remoteAddr,
- ListenAddr: c.listenAddr,
- AdvertiseAddr: c.advertiseAddr,
- })
- if err != nil {
- return err
- }
- return ioutils.AtomicWriteFile(filepath.Join(c.root, stateFile), dt, 0600)
- }
- func (c *Cluster) reconnectOnFailure(n *node) {
- for {
- <-n.done
- c.Lock()
- if c.stop || c.node != nil {
- c.Unlock()
- return
- }
- n.reconnectDelay *= 2
- if n.reconnectDelay > maxReconnectDelay {
- n.reconnectDelay = maxReconnectDelay
- }
- logrus.Warnf("Restarting swarm in %.2f seconds", n.reconnectDelay.Seconds())
- delayCtx, cancel := context.WithTimeout(context.Background(), n.reconnectDelay)
- c.cancelDelay = cancel
- c.Unlock()
- <-delayCtx.Done()
- if delayCtx.Err() != context.DeadlineExceeded {
- return
- }
- c.Lock()
- if c.node != nil {
- c.Unlock()
- return
- }
- var err error
- n, err = c.startNewNode(false, c.localAddr, c.getRemoteAddress(), c.listenAddr, c.advertiseAddr, c.getRemoteAddress(), "")
- if err != nil {
- c.err = err
- close(n.done)
- }
- c.Unlock()
- }
- }
- func (c *Cluster) startNewNode(forceNewCluster bool, localAddr, remoteAddr, listenAddr, advertiseAddr, joinAddr, joinToken string) (*node, error) {
- if err := c.config.Backend.IsSwarmCompatible(); err != nil {
- return nil, err
- }
- actualLocalAddr := localAddr
- if actualLocalAddr == "" {
- // If localAddr was not specified, resolve it automatically
- // based on the route to joinAddr. localAddr can only be left
- // empty on "join".
- listenHost, _, err := net.SplitHostPort(listenAddr)
- if err != nil {
- return nil, fmt.Errorf("could not parse listen address: %v", err)
- }
- listenAddrIP := net.ParseIP(listenHost)
- if listenAddrIP == nil || !listenAddrIP.IsUnspecified() {
- actualLocalAddr = listenHost
- } else {
- if remoteAddr == "" {
- // Should never happen except using swarms created by
- // old versions that didn't save remoteAddr.
- remoteAddr = "8.8.8.8:53"
- }
- conn, err := net.Dial("udp", remoteAddr)
- if err != nil {
- return nil, fmt.Errorf("could not find local IP address: %v", err)
- }
- localHostPort := conn.LocalAddr().String()
- actualLocalAddr, _, _ = net.SplitHostPort(localHostPort)
- conn.Close()
- }
- }
- c.node = nil
- c.cancelDelay = nil
- c.stop = false
- n, err := swarmagent.NewNode(&swarmagent.NodeConfig{
- Hostname: c.config.Name,
- ForceNewCluster: forceNewCluster,
- ListenControlAPI: filepath.Join(c.root, controlSocket),
- ListenRemoteAPI: listenAddr,
- AdvertiseRemoteAPI: advertiseAddr,
- JoinAddr: joinAddr,
- StateDir: c.root,
- JoinToken: joinToken,
- Executor: container.NewExecutor(c.config.Backend),
- HeartbeatTick: 1,
- ElectionTick: 3,
- })
- if err != nil {
- return nil, err
- }
- ctx := context.Background()
- if err := n.Start(ctx); err != nil {
- return nil, err
- }
- node := &node{
- Node: n,
- done: make(chan struct{}),
- reconnectDelay: initialReconnectDelay,
- }
- c.node = node
- c.localAddr = localAddr
- c.actualLocalAddr = actualLocalAddr // not saved
- c.remoteAddr = remoteAddr
- c.listenAddr = listenAddr
- c.advertiseAddr = advertiseAddr
- c.saveState()
- c.config.Backend.SetClusterProvider(c)
- go func() {
- err := n.Err(ctx)
- if err != nil {
- logrus.Errorf("cluster exited with error: %v", err)
- }
- c.Lock()
- c.node = nil
- c.err = err
- c.Unlock()
- close(node.done)
- }()
- go func() {
- select {
- case <-n.Ready():
- c.Lock()
- node.ready = true
- c.err = nil
- c.Unlock()
- case <-ctx.Done():
- }
- c.configEvent <- struct{}{}
- }()
- go func() {
- for conn := range n.ListenControlSocket(ctx) {
- c.Lock()
- if node.conn != conn {
- if conn == nil {
- node.client = nil
- } else {
- node.client = swarmapi.NewControlClient(conn)
- }
- }
- node.conn = conn
- c.Unlock()
- c.configEvent <- struct{}{}
- }
- }()
- return node, nil
- }
- // Init initializes new cluster from user provided request.
- func (c *Cluster) Init(req types.InitRequest) (string, error) {
- c.Lock()
- if node := c.node; node != nil {
- if !req.ForceNewCluster {
- c.Unlock()
- return "", ErrSwarmExists
- }
- if err := c.stopNode(); err != nil {
- c.Unlock()
- return "", err
- }
- }
- if err := validateAndSanitizeInitRequest(&req); err != nil {
- c.Unlock()
- return "", err
- }
- listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
- if err != nil {
- c.Unlock()
- return "", err
- }
- advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
- if err != nil {
- c.Unlock()
- return "", err
- }
- localAddr := listenHost
- // If the advertise address is not one of the system's
- // addresses, we also require a listen address.
- listenAddrIP := net.ParseIP(listenHost)
- if listenAddrIP != nil && listenAddrIP.IsUnspecified() {
- advertiseIP := net.ParseIP(advertiseHost)
- if advertiseIP == nil {
- // not an IP
- c.Unlock()
- return "", errMustSpecifyListenAddr
- }
- systemIPs := listSystemIPs()
- found := false
- for _, systemIP := range systemIPs {
- if systemIP.Equal(advertiseIP) {
- found = true
- break
- }
- }
- if !found {
- c.Unlock()
- return "", errMustSpecifyListenAddr
- }
- localAddr = advertiseIP.String()
- }
- // todo: check current state existing
- n, err := c.startNewNode(req.ForceNewCluster, localAddr, "", net.JoinHostPort(listenHost, listenPort), net.JoinHostPort(advertiseHost, advertisePort), "", "")
- if err != nil {
- c.Unlock()
- return "", err
- }
- c.Unlock()
- select {
- case <-n.Ready():
- if err := initClusterSpec(n, req.Spec); err != nil {
- return "", err
- }
- go c.reconnectOnFailure(n)
- return n.NodeID(), nil
- case <-n.done:
- c.RLock()
- defer c.RUnlock()
- if !req.ForceNewCluster { // if failure on first attempt don't keep state
- if err := c.clearState(); err != nil {
- return "", err
- }
- }
- return "", c.err
- }
- }
- // Join makes current Cluster part of an existing swarm cluster.
- func (c *Cluster) Join(req types.JoinRequest) error {
- c.Lock()
- if node := c.node; node != nil {
- c.Unlock()
- return ErrSwarmExists
- }
- if err := validateAndSanitizeJoinRequest(&req); err != nil {
- c.Unlock()
- return err
- }
- listenHost, listenPort, err := resolveListenAddr(req.ListenAddr)
- if err != nil {
- c.Unlock()
- return err
- }
- var advertiseAddr string
- advertiseHost, advertisePort, err := c.resolveAdvertiseAddr(req.AdvertiseAddr, listenPort)
- // For joining, we don't need to provide an advertise address,
- // since the remote side can detect it.
- if err == nil {
- advertiseAddr = net.JoinHostPort(advertiseHost, advertisePort)
- }
- // todo: check current state existing
- n, err := c.startNewNode(false, "", req.RemoteAddrs[0], net.JoinHostPort(listenHost, listenPort), advertiseAddr, req.RemoteAddrs[0], req.JoinToken)
- if err != nil {
- c.Unlock()
- return err
- }
- c.Unlock()
- select {
- case <-time.After(swarmConnectTimeout):
- // attempt to connect will continue in background, also reconnecting
- go c.reconnectOnFailure(n)
- return ErrSwarmJoinTimeoutReached
- case <-n.Ready():
- go c.reconnectOnFailure(n)
- return nil
- case <-n.done:
- c.RLock()
- defer c.RUnlock()
- return c.err
- }
- }
- // stopNode is a helper that stops the active c.node and waits until it has
- // shut down. Call while keeping the cluster lock.
- func (c *Cluster) stopNode() error {
- if c.node == nil {
- return nil
- }
- c.stop = true
- if c.cancelDelay != nil {
- c.cancelDelay()
- c.cancelDelay = nil
- }
- node := c.node
- ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
- defer cancel()
- // TODO: can't hold lock on stop because it calls back to network
- c.Unlock()
- defer c.Lock()
- if err := node.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
- return err
- }
- <-node.done
- return nil
- }
- // Leave shuts down Cluster and removes current state.
- func (c *Cluster) Leave(force bool) error {
- c.Lock()
- node := c.node
- if node == nil {
- c.Unlock()
- return ErrNoSwarm
- }
- if node.Manager() != nil && !force {
- msg := "You are attempting to leave cluster on a node that is participating as a manager. "
- if c.isActiveManager() {
- active, reachable, unreachable, err := c.managerStats()
- if err == nil {
- if active && reachable-2 <= unreachable {
- if reachable == 1 && unreachable == 0 {
- msg += "Removing the last manager will erase all current state of the cluster. Use `--force` to ignore this message. "
- c.Unlock()
- return fmt.Errorf(msg)
- }
- msg += fmt.Sprintf("Leaving the cluster will leave you with %v managers out of %v. This means Raft quorum will be lost and your cluster will become inaccessible. ", reachable-1, reachable+unreachable)
- }
- }
- } else {
- msg += "Doing so may lose the consensus of your cluster. "
- }
- msg += "The only way to restore a cluster that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to ignore this message."
- c.Unlock()
- return fmt.Errorf(msg)
- }
- if err := c.stopNode(); err != nil {
- c.Unlock()
- return err
- }
- c.Unlock()
- if nodeID := node.NodeID(); nodeID != "" {
- for _, id := range c.config.Backend.ListContainersForNode(nodeID) {
- if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
- logrus.Errorf("error removing %v: %v", id, err)
- }
- }
- }
- c.configEvent <- struct{}{}
- // todo: cleanup optional?
- if err := c.clearState(); err != nil {
- return err
- }
- return nil
- }
- func (c *Cluster) clearState() error {
- // todo: backup this data instead of removing?
- if err := os.RemoveAll(c.root); err != nil {
- return err
- }
- if err := os.MkdirAll(c.root, 0700); err != nil {
- return err
- }
- c.config.Backend.SetClusterProvider(nil)
- return nil
- }
- func (c *Cluster) getRequestContext() (context.Context, func()) { // TODO: not needed when requests don't block on qourum lost
- return context.WithTimeout(context.Background(), swarmRequestTimeout)
- }
- // Inspect retrieves the configuration properties of a managed swarm cluster.
- func (c *Cluster) Inspect() (types.Swarm, error) {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return types.Swarm{}, c.errNoManager()
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- swarm, err := getSwarm(ctx, c.client)
- if err != nil {
- return types.Swarm{}, err
- }
- if err != nil {
- return types.Swarm{}, err
- }
- return convert.SwarmFromGRPC(*swarm), nil
- }
- // Update updates configuration of a managed swarm cluster.
- func (c *Cluster) Update(version uint64, spec types.Spec, flags types.UpdateFlags) error {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return c.errNoManager()
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- swarm, err := getSwarm(ctx, c.client)
- if err != nil {
- return err
- }
- swarmSpec, err := convert.SwarmSpecToGRPC(spec)
- if err != nil {
- return err
- }
- _, err = c.client.UpdateCluster(
- ctx,
- &swarmapi.UpdateClusterRequest{
- ClusterID: swarm.ID,
- Spec: &swarmSpec,
- ClusterVersion: &swarmapi.Version{
- Index: version,
- },
- Rotation: swarmapi.JoinTokenRotation{
- RotateWorkerToken: flags.RotateWorkerToken,
- RotateManagerToken: flags.RotateManagerToken,
- },
- },
- )
- return err
- }
- // IsManager returns true if Cluster is participating as a manager.
- func (c *Cluster) IsManager() bool {
- c.RLock()
- defer c.RUnlock()
- return c.isActiveManager()
- }
- // IsAgent returns true if Cluster is participating as a worker/agent.
- func (c *Cluster) IsAgent() bool {
- c.RLock()
- defer c.RUnlock()
- return c.node != nil && c.ready
- }
- // GetLocalAddress returns the local address.
- func (c *Cluster) GetLocalAddress() string {
- c.RLock()
- defer c.RUnlock()
- return c.actualLocalAddr
- }
- // GetAdvertiseAddress returns the remotely reachable address of this node.
- func (c *Cluster) GetAdvertiseAddress() string {
- c.RLock()
- defer c.RUnlock()
- if c.advertiseAddr != "" {
- advertiseHost, _, _ := net.SplitHostPort(c.advertiseAddr)
- return advertiseHost
- }
- return c.actualLocalAddr
- }
- // GetRemoteAddress returns a known advertise address of a remote manager if
- // available.
- // todo: change to array/connect with info
- func (c *Cluster) GetRemoteAddress() string {
- c.RLock()
- defer c.RUnlock()
- return c.getRemoteAddress()
- }
- func (c *Cluster) getRemoteAddress() string {
- if c.node == nil {
- return ""
- }
- nodeID := c.node.NodeID()
- for _, r := range c.node.Remotes() {
- if r.NodeID != nodeID {
- return r.Addr
- }
- }
- return ""
- }
- // ListenClusterEvents returns a channel that receives messages on cluster
- // participation changes.
- // todo: make cancelable and accessible to multiple callers
- func (c *Cluster) ListenClusterEvents() <-chan struct{} {
- return c.configEvent
- }
- // Info returns information about the current cluster state.
- func (c *Cluster) Info() types.Info {
- info := types.Info{
- NodeAddr: c.GetAdvertiseAddress(),
- }
- c.RLock()
- defer c.RUnlock()
- if c.node == nil {
- info.LocalNodeState = types.LocalNodeStateInactive
- if c.cancelDelay != nil {
- info.LocalNodeState = types.LocalNodeStateError
- }
- } else {
- info.LocalNodeState = types.LocalNodeStatePending
- if c.ready == true {
- info.LocalNodeState = types.LocalNodeStateActive
- }
- }
- if c.err != nil {
- info.Error = c.err.Error()
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- if c.isActiveManager() {
- info.ControlAvailable = true
- swarm, err := c.Inspect()
- if err != nil {
- info.Error = err.Error()
- }
- info.Cluster = swarm
- if r, err := c.client.ListNodes(ctx, &swarmapi.ListNodesRequest{}); err == nil {
- info.Nodes = len(r.Nodes)
- for _, n := range r.Nodes {
- if n.ManagerStatus != nil {
- info.Managers = info.Managers + 1
- }
- }
- }
- }
- if c.node != nil {
- for _, r := range c.node.Remotes() {
- info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
- }
- info.NodeID = c.node.NodeID()
- }
- return info
- }
- // isActiveManager should not be called without a read lock
- func (c *Cluster) isActiveManager() bool {
- return c.node != nil && c.conn != nil
- }
- // errNoManager returns error describing why manager commands can't be used.
- // Call with read lock.
- func (c *Cluster) errNoManager() error {
- if c.node == nil {
- return fmt.Errorf("This node is not a swarm manager. Use \"docker swarm init\" or \"docker swarm join\" to connect this node to swarm and try again.")
- }
- if c.node.Manager() != nil {
- return fmt.Errorf("This node is not a swarm manager. Manager is being prepared or has trouble connecting to the cluster.")
- }
- return fmt.Errorf("This node is not a swarm manager. Worker nodes can't be used to view or modify cluster state. Please run this command on a manager node or promote the current node to a manager.")
- }
- // GetServices returns all services of a managed swarm cluster.
- func (c *Cluster) GetServices(options apitypes.ServiceListOptions) ([]types.Service, error) {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return nil, c.errNoManager()
- }
- filters, err := newListServicesFilters(options.Filter)
- if err != nil {
- return nil, err
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- r, err := c.client.ListServices(
- ctx,
- &swarmapi.ListServicesRequest{Filters: filters})
- if err != nil {
- return nil, err
- }
- services := []types.Service{}
- for _, service := range r.Services {
- services = append(services, convert.ServiceFromGRPC(*service))
- }
- return services, nil
- }
- // CreateService creates a new service in a managed swarm cluster.
- func (c *Cluster) CreateService(s types.ServiceSpec, encodedAuth string) (string, error) {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return "", c.errNoManager()
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- err := c.populateNetworkID(ctx, c.client, &s)
- if err != nil {
- return "", err
- }
- serviceSpec, err := convert.ServiceSpecToGRPC(s)
- if err != nil {
- return "", err
- }
- if encodedAuth != "" {
- ctnr := serviceSpec.Task.GetContainer()
- if ctnr == nil {
- return "", fmt.Errorf("service does not use container tasks")
- }
- ctnr.PullOptions = &swarmapi.ContainerSpec_PullOptions{RegistryAuth: encodedAuth}
- }
- r, err := c.client.CreateService(ctx, &swarmapi.CreateServiceRequest{Spec: &serviceSpec})
- if err != nil {
- return "", err
- }
- return r.Service.ID, nil
- }
- // GetService returns a service based on an ID or name.
- func (c *Cluster) GetService(input string) (types.Service, error) {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return types.Service{}, c.errNoManager()
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- service, err := getService(ctx, c.client, input)
- if err != nil {
- return types.Service{}, err
- }
- return convert.ServiceFromGRPC(*service), nil
- }
- // UpdateService updates existing service to match new properties.
- func (c *Cluster) UpdateService(serviceID string, version uint64, spec types.ServiceSpec, encodedAuth string) error {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return c.errNoManager()
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- err := c.populateNetworkID(ctx, c.client, &spec)
- if err != nil {
- return err
- }
- serviceSpec, err := convert.ServiceSpecToGRPC(spec)
- if err != nil {
- return err
- }
- if encodedAuth != "" {
- ctnr := serviceSpec.Task.GetContainer()
- if ctnr == nil {
- return fmt.Errorf("service does not use container tasks")
- }
- ctnr.PullOptions = &swarmapi.ContainerSpec_PullOptions{RegistryAuth: encodedAuth}
- } else {
- // this is needed because if the encodedAuth isn't being updated then we
- // shouldn't lose it, and continue to use the one that was already present
- currentService, err := getService(ctx, c.client, serviceID)
- if err != nil {
- return err
- }
- ctnr := currentService.Spec.Task.GetContainer()
- if ctnr == nil {
- return fmt.Errorf("service does not use container tasks")
- }
- serviceSpec.Task.GetContainer().PullOptions = ctnr.PullOptions
- }
- _, err = c.client.UpdateService(
- ctx,
- &swarmapi.UpdateServiceRequest{
- ServiceID: serviceID,
- Spec: &serviceSpec,
- ServiceVersion: &swarmapi.Version{
- Index: version,
- },
- },
- )
- return err
- }
- // RemoveService removes a service from a managed swarm cluster.
- func (c *Cluster) RemoveService(input string) error {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return c.errNoManager()
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- service, err := getService(ctx, c.client, input)
- if err != nil {
- return err
- }
- if _, err := c.client.RemoveService(ctx, &swarmapi.RemoveServiceRequest{ServiceID: service.ID}); err != nil {
- return err
- }
- return nil
- }
- // GetNodes returns a list of all nodes known to a cluster.
- func (c *Cluster) GetNodes(options apitypes.NodeListOptions) ([]types.Node, error) {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return nil, c.errNoManager()
- }
- filters, err := newListNodesFilters(options.Filter)
- if err != nil {
- return nil, err
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- r, err := c.client.ListNodes(
- ctx,
- &swarmapi.ListNodesRequest{Filters: filters})
- if err != nil {
- return nil, err
- }
- nodes := []types.Node{}
- for _, node := range r.Nodes {
- nodes = append(nodes, convert.NodeFromGRPC(*node))
- }
- return nodes, nil
- }
- // GetNode returns a node based on an ID or name.
- func (c *Cluster) GetNode(input string) (types.Node, error) {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return types.Node{}, c.errNoManager()
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- node, err := getNode(ctx, c.client, input)
- if err != nil {
- return types.Node{}, err
- }
- return convert.NodeFromGRPC(*node), nil
- }
- // UpdateNode updates existing nodes properties.
- func (c *Cluster) UpdateNode(nodeID string, version uint64, spec types.NodeSpec) error {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return c.errNoManager()
- }
- nodeSpec, err := convert.NodeSpecToGRPC(spec)
- if err != nil {
- return err
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- _, err = c.client.UpdateNode(
- ctx,
- &swarmapi.UpdateNodeRequest{
- NodeID: nodeID,
- Spec: &nodeSpec,
- NodeVersion: &swarmapi.Version{
- Index: version,
- },
- },
- )
- return err
- }
- // RemoveNode removes a node from a cluster
- func (c *Cluster) RemoveNode(input string) error {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return c.errNoManager()
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- node, err := getNode(ctx, c.client, input)
- if err != nil {
- return err
- }
- if _, err := c.client.RemoveNode(ctx, &swarmapi.RemoveNodeRequest{NodeID: node.ID}); err != nil {
- return err
- }
- return nil
- }
- // GetTasks returns a list of tasks matching the filter options.
- func (c *Cluster) GetTasks(options apitypes.TaskListOptions) ([]types.Task, error) {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return nil, c.errNoManager()
- }
- byName := func(filter filters.Args) error {
- if filter.Include("service") {
- serviceFilters := filter.Get("service")
- for _, serviceFilter := range serviceFilters {
- service, err := c.GetService(serviceFilter)
- if err != nil {
- return err
- }
- filter.Del("service", serviceFilter)
- filter.Add("service", service.ID)
- }
- }
- if filter.Include("node") {
- nodeFilters := filter.Get("node")
- for _, nodeFilter := range nodeFilters {
- node, err := c.GetNode(nodeFilter)
- if err != nil {
- return err
- }
- filter.Del("node", nodeFilter)
- filter.Add("node", node.ID)
- }
- }
- return nil
- }
- filters, err := newListTasksFilters(options.Filter, byName)
- if err != nil {
- return nil, err
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- r, err := c.client.ListTasks(
- ctx,
- &swarmapi.ListTasksRequest{Filters: filters})
- if err != nil {
- return nil, err
- }
- tasks := []types.Task{}
- for _, task := range r.Tasks {
- tasks = append(tasks, convert.TaskFromGRPC(*task))
- }
- return tasks, nil
- }
- // GetTask returns a task by an ID.
- func (c *Cluster) GetTask(input string) (types.Task, error) {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return types.Task{}, c.errNoManager()
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- task, err := getTask(ctx, c.client, input)
- if err != nil {
- return types.Task{}, err
- }
- return convert.TaskFromGRPC(*task), nil
- }
- // GetNetwork returns a cluster network by an ID.
- func (c *Cluster) GetNetwork(input string) (apitypes.NetworkResource, error) {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return apitypes.NetworkResource{}, c.errNoManager()
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- network, err := getNetwork(ctx, c.client, input)
- if err != nil {
- return apitypes.NetworkResource{}, err
- }
- return convert.BasicNetworkFromGRPC(*network), nil
- }
- // GetNetworks returns all current cluster managed networks.
- func (c *Cluster) GetNetworks() ([]apitypes.NetworkResource, error) {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return nil, c.errNoManager()
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- r, err := c.client.ListNetworks(ctx, &swarmapi.ListNetworksRequest{})
- if err != nil {
- return nil, err
- }
- var networks []apitypes.NetworkResource
- for _, network := range r.Networks {
- networks = append(networks, convert.BasicNetworkFromGRPC(*network))
- }
- return networks, nil
- }
- // CreateNetwork creates a new cluster managed network.
- func (c *Cluster) CreateNetwork(s apitypes.NetworkCreateRequest) (string, error) {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return "", c.errNoManager()
- }
- if runconfig.IsPreDefinedNetwork(s.Name) {
- err := fmt.Errorf("%s is a pre-defined network and cannot be created", s.Name)
- return "", errors.NewRequestForbiddenError(err)
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- networkSpec := convert.BasicNetworkCreateToGRPC(s)
- r, err := c.client.CreateNetwork(ctx, &swarmapi.CreateNetworkRequest{Spec: &networkSpec})
- if err != nil {
- return "", err
- }
- return r.Network.ID, nil
- }
- // RemoveNetwork removes a cluster network.
- func (c *Cluster) RemoveNetwork(input string) error {
- c.RLock()
- defer c.RUnlock()
- if !c.isActiveManager() {
- return c.errNoManager()
- }
- ctx, cancel := c.getRequestContext()
- defer cancel()
- network, err := getNetwork(ctx, c.client, input)
- if err != nil {
- return err
- }
- if _, err := c.client.RemoveNetwork(ctx, &swarmapi.RemoveNetworkRequest{NetworkID: network.ID}); err != nil {
- return err
- }
- return nil
- }
- func (c *Cluster) populateNetworkID(ctx context.Context, client swarmapi.ControlClient, s *types.ServiceSpec) error {
- for i, n := range s.Networks {
- apiNetwork, err := getNetwork(ctx, client, n.Target)
- if err != nil {
- if ln, _ := c.config.Backend.FindNetwork(n.Target); ln != nil && !ln.Info().Dynamic() {
- err = fmt.Errorf("network %s is not eligible for docker services", ln.Name())
- return errors.NewRequestForbiddenError(err)
- }
- return err
- }
- s.Networks[i].Target = apiNetwork.ID
- }
- return nil
- }
- func getNetwork(ctx context.Context, c swarmapi.ControlClient, input string) (*swarmapi.Network, error) {
- // GetNetwork to match via full ID.
- rg, err := c.GetNetwork(ctx, &swarmapi.GetNetworkRequest{NetworkID: input})
- if err != nil {
- // If any error (including NotFound), ListNetworks to match via ID prefix and full name.
- rl, err := c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{Names: []string{input}}})
- if err != nil || len(rl.Networks) == 0 {
- rl, err = c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{IDPrefixes: []string{input}}})
- }
- if err != nil {
- return nil, err
- }
- if len(rl.Networks) == 0 {
- return nil, fmt.Errorf("network %s not found", input)
- }
- if l := len(rl.Networks); l > 1 {
- return nil, fmt.Errorf("network %s is ambiguous (%d matches found)", input, l)
- }
- return rl.Networks[0], nil
- }
- return rg.Network, nil
- }
- // Cleanup stops active swarm node. This is run before daemon shutdown.
- func (c *Cluster) Cleanup() {
- c.Lock()
- node := c.node
- if node == nil {
- c.Unlock()
- return
- }
- defer c.Unlock()
- if c.isActiveManager() {
- active, reachable, unreachable, err := c.managerStats()
- if err == nil {
- singlenode := active && reachable == 1 && unreachable == 0
- if active && !singlenode && reachable-2 <= unreachable {
- logrus.Errorf("Leaving cluster with %v managers left out of %v. Raft quorum will be lost.", reachable-1, reachable+unreachable)
- }
- }
- }
- c.stopNode()
- }
- func (c *Cluster) managerStats() (current bool, reachable int, unreachable int, err error) {
- ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
- defer cancel()
- nodes, err := c.client.ListNodes(ctx, &swarmapi.ListNodesRequest{})
- if err != nil {
- return false, 0, 0, err
- }
- for _, n := range nodes.Nodes {
- if n.ManagerStatus != nil {
- if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_REACHABLE {
- reachable++
- if n.ID == c.node.NodeID() {
- current = true
- }
- }
- if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_UNREACHABLE {
- unreachable++
- }
- }
- }
- return
- }
- func validateAndSanitizeInitRequest(req *types.InitRequest) error {
- var err error
- req.ListenAddr, err = validateAddr(req.ListenAddr)
- if err != nil {
- return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
- }
- spec := &req.Spec
- // provide sane defaults instead of erroring
- if spec.Name == "" {
- spec.Name = "default"
- }
- if spec.Raft.SnapshotInterval == 0 {
- spec.Raft.SnapshotInterval = defaultSpec.Raft.SnapshotInterval
- }
- if spec.Raft.LogEntriesForSlowFollowers == 0 {
- spec.Raft.LogEntriesForSlowFollowers = defaultSpec.Raft.LogEntriesForSlowFollowers
- }
- if spec.Raft.ElectionTick == 0 {
- spec.Raft.ElectionTick = defaultSpec.Raft.ElectionTick
- }
- if spec.Raft.HeartbeatTick == 0 {
- spec.Raft.HeartbeatTick = defaultSpec.Raft.HeartbeatTick
- }
- if spec.Dispatcher.HeartbeatPeriod == 0 {
- spec.Dispatcher.HeartbeatPeriod = defaultSpec.Dispatcher.HeartbeatPeriod
- }
- if spec.CAConfig.NodeCertExpiry == 0 {
- spec.CAConfig.NodeCertExpiry = defaultSpec.CAConfig.NodeCertExpiry
- }
- if spec.Orchestration.TaskHistoryRetentionLimit == 0 {
- spec.Orchestration.TaskHistoryRetentionLimit = defaultSpec.Orchestration.TaskHistoryRetentionLimit
- }
- return nil
- }
- func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
- var err error
- req.ListenAddr, err = validateAddr(req.ListenAddr)
- if err != nil {
- return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
- }
- if len(req.RemoteAddrs) == 0 {
- return fmt.Errorf("at least 1 RemoteAddr is required to join")
- }
- for i := range req.RemoteAddrs {
- req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
- if err != nil {
- return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
- }
- }
- return nil
- }
- func validateAddr(addr string) (string, error) {
- if addr == "" {
- return addr, fmt.Errorf("invalid empty address")
- }
- newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
- if err != nil {
- return addr, nil
- }
- return strings.TrimPrefix(newaddr, "tcp://"), nil
- }
- func initClusterSpec(node *node, spec types.Spec) error {
- ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
- for conn := range node.ListenControlSocket(ctx) {
- if ctx.Err() != nil {
- return ctx.Err()
- }
- if conn != nil {
- client := swarmapi.NewControlClient(conn)
- var cluster *swarmapi.Cluster
- for i := 0; ; i++ {
- lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
- if err != nil {
- return fmt.Errorf("error on listing clusters: %v", err)
- }
- if len(lcr.Clusters) == 0 {
- if i < 10 {
- time.Sleep(200 * time.Millisecond)
- continue
- }
- return fmt.Errorf("empty list of clusters was returned")
- }
- cluster = lcr.Clusters[0]
- break
- }
- newspec, err := convert.SwarmSpecToGRPC(spec)
- if err != nil {
- return fmt.Errorf("error updating cluster settings: %v", err)
- }
- _, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
- ClusterID: cluster.ID,
- ClusterVersion: &cluster.Meta.Version,
- Spec: &newspec,
- })
- if err != nil {
- return fmt.Errorf("error updating cluster settings: %v", err)
- }
- return nil
- }
- }
- return ctx.Err()
- }
|