Bladeren bron

Merge pull request #17191 from mrjana/restart

Fix docker startup failure due to dangling endpoints
Tibor Vass 9 jaren geleden
bovenliggende
commit
566964bf73

+ 38 - 36
daemon/container_unix.go

@@ -910,6 +910,8 @@ func (container *Container) ConnectToNetwork(idOrName string) error {
 }
 
 func (container *Container) connectToNetwork(idOrName string, updateSettings bool) error {
+	var err error
+
 	if container.hostConfig.NetworkMode.IsContainer() {
 		return runconfig.ErrConflictSharedNetwork
 	}
@@ -934,22 +936,31 @@ func (container *Container) connectToNetwork(idOrName string, updateSettings boo
 	}
 
 	ep, err := container.getEndpointInNetwork(n)
-	if err != nil {
-		if _, ok := err.(libnetwork.ErrNoSuchEndpoint); !ok {
-			return err
-		}
+	if err == nil {
+		return fmt.Errorf("container already connected to network %s", idOrName)
+	}
 
-		createOptions, err := container.buildCreateEndpointOptions()
-		if err != nil {
-			return err
-		}
+	if _, ok := err.(libnetwork.ErrNoSuchEndpoint); !ok {
+		return err
+	}
+
+	createOptions, err := container.buildCreateEndpointOptions()
+	if err != nil {
+		return err
+	}
 
-		endpointName := strings.TrimPrefix(container.Name, "/")
-		ep, err = n.CreateEndpoint(endpointName, createOptions...)
+	endpointName := strings.TrimPrefix(container.Name, "/")
+	ep, err = n.CreateEndpoint(endpointName, createOptions...)
+	if err != nil {
+		return err
+	}
+	defer func() {
 		if err != nil {
-			return err
+			if e := ep.Delete(); e != nil {
+				logrus.Warnf("Could not rollback container connection to network %s", idOrName)
+			}
 		}
-	}
+	}()
 
 	if err := container.updateEndpointNetworkSettings(n, ep); err != nil {
 		return err
@@ -972,9 +983,9 @@ func (container *Container) connectToNetwork(idOrName string, updateSettings boo
 		if err != nil {
 			return err
 		}
-	}
 
-	container.updateSandboxNetworkSettings(sb)
+		container.updateSandboxNetworkSettings(sb)
+	}
 
 	if err := ep.Join(sb); err != nil {
 		return err
@@ -1120,14 +1131,6 @@ func (container *Container) releaseNetwork() {
 		return
 	}
 
-	for _, ns := range networks {
-		n, err := container.daemon.FindNetwork(ns)
-		if err != nil {
-			continue
-		}
-		container.disconnectFromNetwork(n, false)
-	}
-
 	if err := sb.Delete(); err != nil {
 		logrus.Errorf("Error deleting sandbox id %s for container %s: %v", sid, container.ID, err)
 	}
@@ -1139,10 +1142,10 @@ func (container *Container) DisconnectFromNetwork(n libnetwork.Network) error {
 		return derr.ErrorCodeNotRunning.WithArgs(container.ID)
 	}
 
-	return container.disconnectFromNetwork(n, true)
+	return container.disconnectFromNetwork(n)
 }
 
-func (container *Container) disconnectFromNetwork(n libnetwork.Network, updateSettings bool) error {
+func (container *Container) disconnectFromNetwork(n libnetwork.Network) error {
 	var (
 		ep   libnetwork.Endpoint
 		sbox libnetwork.Sandbox
@@ -1172,20 +1175,19 @@ func (container *Container) disconnectFromNetwork(n libnetwork.Network, updateSe
 		return fmt.Errorf("endpoint delete failed for container %s on network %s: %v", container.ID, n.Name(), err)
 	}
 
-	if updateSettings {
-		networks := container.NetworkSettings.Networks
-		for i, s := range networks {
-			sn, err := container.daemon.FindNetwork(s)
-			if err != nil {
-				continue
-			}
-			if sn.Name() == n.Name() {
-				networks = append(networks[:i], networks[i+1:]...)
-				container.NetworkSettings.Networks = networks
-				break
-			}
+	networks := container.NetworkSettings.Networks
+	for i, s := range networks {
+		sn, err := container.daemon.FindNetwork(s)
+		if err != nil {
+			continue
+		}
+		if sn.Name() == n.Name() {
+			networks = append(networks[:i], networks[i+1:]...)
+			container.NetworkSettings.Networks = networks
+			break
 		}
 	}
+
 	return nil
 }
 

+ 1 - 1
hack/vendor.sh

@@ -21,7 +21,7 @@ clone git github.com/vdemeester/shakers 3c10293ce22b900c27acad7b28656196fcc2f73b
 clone git golang.org/x/net 3cffabab72adf04f8e3b01c5baf775361837b5fe https://github.com/golang/net.git
 
 #get libnetwork packages
-clone git github.com/docker/libnetwork 2934f6bf585fa24c86048cc85f7506a5bb626bf5
+clone git github.com/docker/libnetwork fc6cbea49cd8197c0a8d22b9e8f24f37d9e7b1b8
 clone git github.com/armon/go-metrics eb0af217e5e9747e41dd5303755356b62d28e3ec
 clone git github.com/hashicorp/go-msgpack 71c2886f5a673a35f909803f38ece5810165097b
 clone git github.com/hashicorp/memberlist 9a1e242e454d2443df330bdd51a436d5a9058fc4

+ 1 - 1
vendor/src/github.com/docker/libnetwork/Makefile

@@ -1,6 +1,6 @@
 .PHONY: all all-local build build-local check check-code check-format run-tests check-local integration-tests install-deps coveralls circle-ci start-services clean
 SHELL=/bin/bash
-build_image=libnetwork-build
+build_image=libnetworkbuild
 dockerargs = --privileged -v $(shell pwd):/go/src/github.com/docker/libnetwork -w /go/src/github.com/docker/libnetwork
 container_env = -e "INSIDECONTAINER=-incontainer=true"
 docker = docker run --rm -it ${dockerargs} ${container_env} ${build_image}

+ 2 - 1
vendor/src/github.com/docker/libnetwork/controller.go

@@ -192,6 +192,7 @@ func New(cfgOptions ...config.Option) (NetworkController, error) {
 	}
 
 	c.sandboxCleanup()
+	c.cleanupLocalEndpoints()
 
 	if err := c.startExternalKeyListener(); err != nil {
 		return nil, err
@@ -356,7 +357,7 @@ func (c *controller) NewNetwork(networkType, name string, options ...NetworkOpti
 		}
 	}()
 
-	if err := c.addNetwork(network); err != nil {
+	if err = c.addNetwork(network); err != nil {
 		return nil, err
 	}
 	defer func() {

+ 12 - 2
vendor/src/github.com/docker/libnetwork/datastore/datastore.go

@@ -188,7 +188,7 @@ func ParseKey(key string) ([]string, error) {
 }
 
 // newClient used to connect to KV Store
-func newClient(scope string, kv string, addrs string, config *store.Config, cached bool) (DataStore, error) {
+func newClient(scope string, kv string, addr string, config *store.Config, cached bool) (DataStore, error) {
 	if cached && scope != LocalScope {
 		return nil, fmt.Errorf("caching supported only for scope %s", LocalScope)
 	}
@@ -196,7 +196,10 @@ func newClient(scope string, kv string, addrs string, config *store.Config, cach
 	if config == nil {
 		config = &store.Config{}
 	}
-	store, err := libkv.NewStore(store.Backend(kv), []string{addrs}, config)
+
+	addrs := strings.Split(addr, ",")
+
+	store, err := libkv.NewStore(store.Backend(kv), addrs, config)
 	if err != nil {
 		return nil, err
 	}
@@ -262,6 +265,13 @@ func (ds *datastore) Watch(kvObject KVObject, stopCh <-chan struct{}) (<-chan KV
 				close(sCh)
 				return
 			case kvPair := <-kvpCh:
+				// If the backend KV store gets reset, libkv's goroutine
+				// for the watch can exit, resulting in a nil value on
+				// the channel.
+				if kvPair == nil {
+					close(sCh)
+					return
+				}
 				dstO := ctor.New()
 
 				if err := dstO.SetValue(kvPair.Value); err != nil {

+ 2 - 2
vendor/src/github.com/docker/libnetwork/drivers/bridge/bridge.go

@@ -989,7 +989,7 @@ func (d *driver) DeleteEndpoint(nid, eid string) error {
 	d.Unlock()
 
 	if !ok {
-		return types.NotFoundErrorf("network %s does not exist", nid)
+		return types.InternalMaskableErrorf("network %s does not exist", nid)
 	}
 	if n == nil {
 		return driverapi.ErrNoNetwork(nid)
@@ -1145,7 +1145,7 @@ func (d *driver) Leave(nid, eid string) error {
 
 	network, err := d.getNetwork(nid)
 	if err != nil {
-		return err
+		return types.InternalMaskableErrorf("%s", err)
 	}
 
 	endpoint, err := network.getEndpoint(eid)

+ 3 - 2
vendor/src/github.com/docker/libnetwork/drivers/overlay/ov_network.go

@@ -179,6 +179,7 @@ func (n *network) destroySandbox() {
 			}
 		}
 		sbox.Destroy()
+		n.setSandbox(nil)
 	}
 }
 
@@ -193,7 +194,7 @@ func (n *network) initSubnetSandbox(s *subnet) error {
 	if err := sbox.AddInterface(brName, "br",
 		sbox.InterfaceOptions().Address(s.gwIP),
 		sbox.InterfaceOptions().Bridge(true)); err != nil {
-		return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.IP.String(), err)
+		return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), err)
 	}
 
 	vxlanName, err := createVxlan(n.vxlanID(s))
@@ -203,7 +204,7 @@ func (n *network) initSubnetSandbox(s *subnet) error {
 
 	if err := sbox.AddInterface(vxlanName, "vxlan",
 		sbox.InterfaceOptions().Master(brName)); err != nil {
-		return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.IP.String(), err)
+		return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err)
 	}
 
 	n.Lock()

+ 43 - 14
vendor/src/github.com/docker/libnetwork/endpoint.go

@@ -425,28 +425,32 @@ func (ep *endpoint) sbLeave(sbox Sandbox, options ...EndpointOption) error {
 
 	ep.processOptions(options...)
 
-	ep.Lock()
-	ep.sandboxID = ""
-	ep.network = n
-	ep.Unlock()
-
-	if err := n.getController().updateToStore(ep); err != nil {
-		ep.Lock()
-		ep.sandboxID = sid
-		ep.Unlock()
-		return err
-	}
-
 	d, err := n.driver()
 	if err != nil {
 		return fmt.Errorf("failed to leave endpoint: %v", err)
 	}
 
+	ep.Lock()
+	ep.sandboxID = ""
+	ep.network = n
+	ep.Unlock()
+
 	if err := d.Leave(n.id, ep.id); err != nil {
-		return err
+		if _, ok := err.(types.MaskableError); !ok {
+			log.Warnf("driver error disconnecting container %s : %v", ep.name, err)
+		}
 	}
 
 	if err := sb.clearNetworkResources(ep); err != nil {
+		log.Warnf("Could not cleanup network resources on container %s disconnect: %v", ep.name, err)
+	}
+
+	// Update the store about the sandbox detach only after we
+	// have completed sb.clearNetworkResources above, to avoid
+	// spurious logs when cleaning up the sandbox if the daemon
+	// ungracefully exits and restarts before completing sandbox
+	// detach but after the store has been updated.
+	if err := n.getController().updateToStore(ep); err != nil {
 		return err
 	}
 
@@ -532,7 +536,10 @@ func (ep *endpoint) deleteEndpoint() error {
 		if _, ok := err.(types.ForbiddenError); ok {
 			return err
 		}
-		log.Warnf("driver error deleting endpoint %s : %v", name, err)
+
+		if _, ok := err.(types.MaskableError); !ok {
+			log.Warnf("driver error deleting endpoint %s : %v", name, err)
+		}
 	}
 
 	return nil
@@ -704,3 +711,25 @@ func (ep *endpoint) releaseAddress() {
 		}
 	}
 }
+
+func (c *controller) cleanupLocalEndpoints() { // cleanupLocalEndpoints calls Delete on every endpoint returned by getEndpointsFromStore for each local-scope network; errors are logged as warnings and never returned.
+	nl, err := c.getNetworksForScope(datastore.LocalScope)
+	if err != nil {
+		log.Warnf("Could not get list of networks during endpoint cleanup: %v", err)
+		return // without the network list there is nothing to iterate over
+	}
+
+	for _, n := range nl {
+		epl, err := n.getEndpointsFromStore()
+		if err != nil {
+			log.Warnf("Could not get list of endpoints in network %s during endpoint cleanup: %v", n.name, err)
+			continue // skip this network but keep processing the remaining ones
+		}
+
+		for _, ep := range epl {
+			if err := ep.Delete(); err != nil { // best effort: a failed delete is only logged, the loop continues
+				log.Warnf("Could not delete local endpoint %s during endpoint cleanup: %v", ep.name, err)
+			}
+		}
+	}
+}

+ 3 - 0
vendor/src/github.com/docker/libnetwork/ipamutils/utils_linux.go

@@ -6,6 +6,7 @@ import (
 	"net"
 
 	"github.com/docker/libnetwork/netutils"
+	"github.com/docker/libnetwork/osl"
 	"github.com/docker/libnetwork/resolvconf"
 	"github.com/vishvananda/netlink"
 )
@@ -21,6 +22,8 @@ func ElectInterfaceAddresses(name string) (*net.IPNet, []*net.IPNet, error) {
 		err    error
 	)
 
+	defer osl.InitOSContext()()
+
 	link, _ := netlink.LinkByName(name)
 	if link != nil {
 		v4addr, err := netlink.AddrList(link, netlink.FAMILY_V4)

+ 2 - 2
vendor/src/github.com/docker/libnetwork/netutils/utils.go

@@ -161,8 +161,8 @@ func GenerateIfaceName(prefix string, len int) (string, error) {
 		if err != nil {
 			continue
 		}
-		if _, err := net.InterfaceByName(name); err != nil {
-			if strings.Contains(err.Error(), "no such") {
+		if _, err := netlink.LinkByName(name); err != nil {
+			if strings.Contains(err.Error(), "not found") {
 				return name, nil
 			}
 			return "", err

+ 42 - 2
vendor/src/github.com/docker/libnetwork/sandbox.go

@@ -66,6 +66,7 @@ type sandbox struct {
 	joinLeaveDone chan struct{}
 	dbIndex       uint64
 	dbExists      bool
+	inDelete      bool
 	sync.Mutex
 }
 
@@ -146,6 +147,22 @@ func (sb *sandbox) Statistics() (map[string]*types.InterfaceStatistics, error) {
 }
 
 func (sb *sandbox) Delete() error {
+	sb.Lock()
+	if sb.inDelete {
+		sb.Unlock()
+		return types.ForbiddenErrorf("another sandbox delete in progress")
+	}
+	// Set the inDelete flag. This will ensure that we don't
+	// update the store until we have completed all the endpoint
+	// leaves and deletes. And when endpoint leaves and deletes
+	// are completed then we can finally delete the sandbox object
+	// altogether from the data store. If the daemon exits
+	// ungracefully in the middle of a sandbox delete this way we
+	// will have all the references to the endpoints in the
+	// sandbox so that we can clean them up when we restart
+	sb.inDelete = true
+	sb.Unlock()
+
 	c := sb.controller
 
 	// Detach from all endpoints
@@ -355,6 +372,10 @@ func releaseOSSboxResources(osSbox osl.Sandbox, ep *endpoint) {
 	joinInfo := ep.joinInfo
 	ep.Unlock()
 
+	if joinInfo == nil {
+		return
+	}
+
 	// Remove non-interface routes.
 	for _, r := range joinInfo.StaticRoutes {
 		if err := osSbox.RemoveStaticRoute(r); err != nil {
@@ -386,6 +407,7 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
 		sb.Unlock()
 		return nil
 	}
+	inDelete := sb.inDelete
 	sb.Unlock()
 
 	ep.Lock()
@@ -425,7 +447,16 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
 			}
 		}
 	}
-	return sb.storeUpdate()
+
+	// Only update the store if we did not come here as part of
+	// sandbox delete. If we came here as part of delete then do
+	// not bother updating the store. The sandbox object will be
+	// deleted anyway
+	if !inDelete {
+		return sb.storeUpdate()
+	}
+
+	return nil
 }
 
 func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
@@ -437,6 +468,7 @@ func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
 
 	sb.Lock()
 	osSbox := sb.osSbox
+	inDelete := sb.inDelete
 	sb.Unlock()
 	if osSbox != nil {
 		releaseOSSboxResources(osSbox, ep)
@@ -480,7 +512,15 @@ func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
 		sb.updateGateway(gwepAfter)
 	}
 
-	return sb.storeUpdate()
+	// Only update the store if we did not come here as part of
+	// sandbox delete. If we came here as part of delete then do
+	// not bother updating the store. The sandbox object will be
+	// deleted anyway
+	if !inDelete {
+		return sb.storeUpdate()
+	}
+
+	return nil
 }
 
 const (

+ 12 - 1
vendor/src/github.com/docker/libnetwork/sandbox_store.go

@@ -123,6 +123,8 @@ func (sb *sandbox) storeUpdate() error {
 		ID: sb.id,
 	}
 
+retry:
+	sbs.Eps = nil
 	for _, ep := range sb.getConnectedEndpoints() {
 		eps := epState{
 			Nid: ep.getNetwork().ID(),
@@ -132,7 +134,16 @@ func (sb *sandbox) storeUpdate() error {
 		sbs.Eps = append(sbs.Eps, eps)
 	}
 
-	return sb.controller.updateToStore(sbs)
+	err := sb.controller.updateToStore(sbs)
+	if err == datastore.ErrKeyModified {
+		// When we get ErrKeyModified it is sufficient to just
+		// go back and retry.  No need to get the object from
+		// the store because we always regenerate the store
+		// state from in memory sandbox state
+		goto retry
+	}
+
+	return err
 }
 
 func (sb *sandbox) storeDelete() error {

+ 32 - 0
vendor/src/github.com/docker/libnetwork/store.go

@@ -82,6 +82,38 @@ func (c *controller) getNetworkFromStore(nid string) (*network, error) {
 	return nil, fmt.Errorf("network %s not found", nid)
 }
 
+func (c *controller) getNetworksForScope(scope string) ([]*network, error) { // getNetworksForScope lists the networks kept in the store for scope and attaches each one's endpoint-count object (epCnt) before returning them.
+	var nl []*network
+
+	store := c.getStore(scope)
+	if store == nil {
+		return nil, nil // no store for this scope: reported as an empty result, not as an error
+	}
+
+	kvol, err := store.List(datastore.Key(datastore.NetworkKeyPrefix),
+		&network{ctrlr: c})
+	if err != nil && err != datastore.ErrKeyNotFound { // ErrKeyNotFound is not treated as a failure; the loop below runs over whatever List returned
+		return nil, fmt.Errorf("failed to get networks for scope %s: %v",
+			scope, err)
+	}
+
+	for _, kvo := range kvol {
+		n := kvo.(*network) // unchecked type assertion: panics if List yields anything other than *network
+		n.ctrlr = c
+
+		ec := &endpointCnt{n: n}
+		err = store.GetObject(datastore.Key(ec.Key()...), ec)
+		if err != nil { // a single unreadable endpoint count aborts the whole listing
+			return nil, fmt.Errorf("could not find endpoint count key %s for network %s while listing: %v", datastore.Key(ec.Key()...), n.Name(), err)
+		}
+
+		n.epCnt = ec
+		nl = append(nl, n)
+	}
+
+	return nl, nil
+}
+
 func (c *controller) getNetworksFromStore() ([]*network, error) {
 	var nl []*network