Pārlūkot izejas kodu

Merge pull request #2146 from ctelfer/fix-overlay-vxlan-races

Fix overlay vxlan races
Flavio Crisciani 7 gadi atpakaļ
vecāks
revīzija
0f593ae92b

+ 1 - 9
libnetwork/drivers/overlay/joinleave.go

@@ -47,18 +47,10 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
 		return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err)
 	}
 
-	if err := n.joinSandbox(false); err != nil {
+	if err := n.joinSandbox(s, false, true); err != nil {
 		return fmt.Errorf("network sandbox join failed: %v", err)
 	}
 
-	if err := n.joinSubnetSandbox(s, false); err != nil {
-		return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err)
-	}
-
-	// joinSubnetSandbox gets called when an endpoint comes up on a new subnet in the
-	// overlay network. Hence the Endpoint count should be updated outside joinSubnetSandbox
-	n.incEndpointCount()
-
 	sbox := n.sandbox()
 
 	overlayIfName, containerIfName, err := createVethPair()

+ 95 - 76
libnetwork/drivers/overlay/ov_network.go

@@ -39,7 +39,7 @@ var (
 type networkTable map[string]*network
 
 type subnet struct {
-	once      *sync.Once
+	sboxInit  bool
 	vxlanName string
 	brName    string
 	vni       uint32
@@ -63,7 +63,7 @@ type network struct {
 	endpoints endpointTable
 	driver    *driver
 	joinCnt   int
-	once      *sync.Once
+	sboxInit  bool
 	initEpoch int
 	initErr   error
 	subnets   []*subnet
@@ -150,7 +150,6 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo d
 		id:        id,
 		driver:    d,
 		endpoints: endpointTable{},
-		once:      &sync.Once{},
 		subnets:   []*subnet{},
 	}
 
@@ -193,7 +192,6 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo d
 		s := &subnet{
 			subnetIP: ipd.Pool,
 			gwIP:     ipd.Gateway,
-			once:     &sync.Once{},
 		}
 
 		if len(vnis) != 0 {
@@ -277,7 +275,7 @@ func (d *driver) DeleteNetwork(nid string) error {
 			logrus.Warnf("Failed to delete overlay endpoint %.7s from local store: %v", ep.id, err)
 		}
 	}
-	// flush the peerDB entries
+
 	doPeerFlush = true
 	delete(d.networks, nid)
 
@@ -304,29 +302,54 @@ func (d *driver) RevokeExternalConnectivity(nid, eid string) error {
 	return nil
 }
 
-func (n *network) incEndpointCount() {
-	n.Lock()
-	defer n.Unlock()
-	n.joinCnt++
-}
-
-func (n *network) joinSandbox(restore bool) error {
+func (n *network) joinSandbox(s *subnet, restore bool, incJoinCount bool) error {
 	// If there is a race between two go routines here only one will win
 	// the other will wait.
-	n.once.Do(func() {
-		// save the error status of initSandbox in n.initErr so that
-		// all the racing go routines are able to know the status.
+	networkOnce.Do(networkOnceInit)
+
+	n.Lock()
+	// If non-restore initialization occurred and was successful then
+	// tell the peerDB to initialize the sandbox with all the peers
+	// previously received from networkdb.  But only do this after
+	// unlocking the network.  Otherwise we could deadlock with
+	// on the peerDB channel while peerDB is waiting for the network lock.
+	var doInitPeerDB bool
+	defer func() {
+		n.Unlock()
+		if doInitPeerDB {
+			n.driver.initSandboxPeerDB(n.id)
+		}
+	}()
+
+	if !n.sboxInit {
 		n.initErr = n.initSandbox(restore)
-	})
+		doInitPeerDB = n.initErr == nil && !restore
+		// If there was an error, we cannot recover it
+		n.sboxInit = true
+	}
 
-	return n.initErr
-}
+	if n.initErr != nil {
+		return fmt.Errorf("network sandbox join failed: %v", n.initErr)
+	}
 
-func (n *network) joinSubnetSandbox(s *subnet, restore bool) error {
-	s.once.Do(func() {
-		s.initErr = n.initSubnetSandbox(s, restore)
-	})
-	return s.initErr
+	subnetErr := s.initErr
+	if !s.sboxInit {
+		subnetErr = n.initSubnetSandbox(s, restore)
+		// We can recover from these errors, but not on restore
+		if restore || subnetErr == nil {
+			s.initErr = subnetErr
+			s.sboxInit = true
+		}
+	}
+	if subnetErr != nil {
+		return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), subnetErr)
+	}
+
+	if incJoinCount {
+		n.joinCnt++
+	}
+
+	return nil
 }
 
 func (n *network) leaveSandbox() {
@@ -337,15 +360,14 @@ func (n *network) leaveSandbox() {
 		return
 	}
 
-	// We are about to destroy sandbox since the container is leaving the network
-	// Reinitialize the once variable so that we will be able to trigger one time
-	// sandbox initialization(again) when another container joins subsequently.
-	n.once = &sync.Once{}
+	n.destroySandbox()
+
+	n.sboxInit = false
+	n.initErr = nil
 	for _, s := range n.subnets {
-		s.once = &sync.Once{}
+		s.sboxInit = false
+		s.initErr = nil
 	}
-
-	n.destroySandbox()
 }
 
 // to be called while holding network lock
@@ -478,7 +500,7 @@ func (n *network) generateVxlanName(s *subnet) string {
 		id = n.id[:5]
 	}
 
-	return "vx-" + fmt.Sprintf("%06x", n.vxlanID(s)) + "-" + id
+	return fmt.Sprintf("vx-%06x-%v", s.vni, id)
 }
 
 func (n *network) generateBridgeName(s *subnet) string {
@@ -491,7 +513,7 @@ func (n *network) generateBridgeName(s *subnet) string {
 }
 
 func (n *network) getBridgeNamePrefix(s *subnet) string {
-	return "ov-" + fmt.Sprintf("%06x", n.vxlanID(s))
+	return fmt.Sprintf("ov-%06x", s.vni)
 }
 
 func checkOverlap(nw *net.IPNet) error {
@@ -513,7 +535,7 @@ func checkOverlap(nw *net.IPNet) error {
 }
 
 func (n *network) restoreSubnetSandbox(s *subnet, brName, vxlanName string) error {
-	sbox := n.sandbox()
+	sbox := n.sbox
 
 	// restore overlay osl sandbox
 	Ifaces := make(map[string][]osl.IfaceOption)
@@ -542,7 +564,7 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error
 			deleteInterfaceBySubnet(n.getBridgeNamePrefix(s), s)
 		}
 		// Try to delete the vxlan interface by vni if already present
-		deleteVxlanByVNI("", n.vxlanID(s))
+		deleteVxlanByVNI("", s.vni)
 
 		if err := checkOverlap(s.subnetIP); err != nil {
 			return err
@@ -556,24 +578,24 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error
 		// it must a stale namespace from previous
 		// life. Destroy it completely and reclaim resourced.
 		networkMu.Lock()
-		path, ok := vniTbl[n.vxlanID(s)]
+		path, ok := vniTbl[s.vni]
 		networkMu.Unlock()
 
 		if ok {
-			deleteVxlanByVNI(path, n.vxlanID(s))
+			deleteVxlanByVNI(path, s.vni)
 			if err := syscall.Unmount(path, syscall.MNT_FORCE); err != nil {
 				logrus.Errorf("unmount of %s failed: %v", path, err)
 			}
 			os.Remove(path)
 
 			networkMu.Lock()
-			delete(vniTbl, n.vxlanID(s))
+			delete(vniTbl, s.vni)
 			networkMu.Unlock()
 		}
 	}
 
 	// create a bridge and vxlan device for this subnet and move it to the sandbox
-	sbox := n.sandbox()
+	sbox := n.sbox
 
 	if err := sbox.AddInterface(brName, "br",
 		sbox.InterfaceOptions().Address(s.gwIP),
@@ -581,13 +603,30 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error
 		return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), err)
 	}
 
-	err := createVxlan(vxlanName, n.vxlanID(s), n.maxMTU())
+	err := createVxlan(vxlanName, s.vni, n.maxMTU())
 	if err != nil {
 		return err
 	}
 
 	if err := sbox.AddInterface(vxlanName, "vxlan",
 		sbox.InterfaceOptions().Master(brName)); err != nil {
+		// If adding vxlan device to the overlay namespace fails, remove the bridge interface we
+		// already added to the namespace. This allows the caller to try the setup again.
+		for _, iface := range sbox.Info().Interfaces() {
+			if iface.SrcName() == brName {
+				if ierr := iface.Remove(); ierr != nil {
+					logrus.Errorf("removing bridge failed from ov ns %v failed, %v", n.sbox.Key(), ierr)
+				}
+			}
+		}
+
+		// Also, delete the vxlan interface. Since a global vni id is associated
+		// with the vxlan interface, an orphaned vxlan interface will result in
+		// failure of vxlan device creation if the vni is assigned to some other
+		// network.
+		if deleteErr := deleteInterface(vxlanName); deleteErr != nil {
+			logrus.Warnf("could not delete vxlan interface, %s, error %v, after config error, %v", vxlanName, deleteErr, err)
+		}
 		return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err)
 	}
 
@@ -619,6 +658,7 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error
 	return nil
 }
 
+// Must be called with the network lock
 func (n *network) initSubnetSandbox(s *subnet, restore bool) error {
 	brName := n.generateBridgeName(s)
 	vxlanName := n.generateVxlanName(s)
@@ -633,10 +673,8 @@ func (n *network) initSubnetSandbox(s *subnet, restore bool) error {
 		}
 	}
 
-	n.Lock()
 	s.vxlanName = vxlanName
 	s.brName = brName
-	n.Unlock()
 
 	return nil
 }
@@ -677,11 +715,7 @@ func (n *network) cleanupStaleSandboxes() {
 }
 
 func (n *network) initSandbox(restore bool) error {
-	n.Lock()
 	n.initEpoch++
-	n.Unlock()
-
-	networkOnce.Do(networkOnceInit)
 
 	if !restore {
 		if hostMode {
@@ -711,12 +745,7 @@ func (n *network) initSandbox(restore bool) error {
 	}
 
 	// this is needed to let the peerAdd configure the sandbox
-	n.setSandbox(sbox)
-
-	if !restore {
-		// Initialize the sandbox with all the peers previously received from networkdb
-		n.driver.initSandboxPeerDB(n.id)
-	}
+	n.sbox = sbox
 
 	// If we are in swarm mode, we don't need anymore the watchMiss routine.
 	// This will save 1 thread and 1 netlink socket per network
@@ -734,7 +763,7 @@ func (n *network) initSandbox(restore bool) error {
 		tv := syscall.NsecToTimeval(soTimeout.Nanoseconds())
 		err = nlSock.SetReceiveTimeout(&tv)
 	})
-	n.setNetlinkSocket(nlSock)
+	n.nlSocket = nlSock
 
 	if err == nil {
 		go n.watchMiss(nlSock, key)
@@ -836,7 +865,6 @@ func (d *driver) restoreNetworkFromStore(nid string) *network {
 	if n != nil {
 		n.driver = d
 		n.endpoints = endpointTable{}
-		n.once = &sync.Once{}
 		d.networks[nid] = n
 	}
 	return n
@@ -844,11 +872,11 @@ func (d *driver) restoreNetworkFromStore(nid string) *network {
 
 func (d *driver) network(nid string) *network {
 	d.Lock()
-	defer d.Unlock()
 	n, ok := d.networks[nid]
 	if !ok {
 		n = d.restoreNetworkFromStore(nid)
 	}
+	d.Unlock()
 
 	return n
 }
@@ -869,26 +897,12 @@ func (d *driver) getNetworkFromStore(nid string) *network {
 func (n *network) sandbox() osl.Sandbox {
 	n.Lock()
 	defer n.Unlock()
-
 	return n.sbox
 }
 
-func (n *network) setSandbox(sbox osl.Sandbox) {
-	n.Lock()
-	n.sbox = sbox
-	n.Unlock()
-}
-
-func (n *network) setNetlinkSocket(nlSk *nl.NetlinkSocket) {
-	n.Lock()
-	n.nlSocket = nlSk
-	n.Unlock()
-}
-
 func (n *network) vxlanID(s *subnet) uint32 {
 	n.Lock()
 	defer n.Unlock()
-
 	return s.vni
 }
 
@@ -997,7 +1011,6 @@ func (n *network) SetValue(value []byte) error {
 				subnetIP: subnetIP,
 				gwIP:     gwIP,
 				vni:      vni,
-				once:     &sync.Once{},
 			}
 			n.subnets = append(n.subnets, s)
 		} else {
@@ -1023,7 +1036,10 @@ func (n *network) writeToStore() error {
 }
 
 func (n *network) releaseVxlanID() ([]uint32, error) {
-	if len(n.subnets) == 0 {
+	n.Lock()
+	nSubnets := len(n.subnets)
+	n.Unlock()
+	if nSubnets == 0 {
 		return nil, nil
 	}
 
@@ -1039,14 +1055,17 @@ func (n *network) releaseVxlanID() ([]uint32, error) {
 		}
 	}
 	var vnis []uint32
+	n.Lock()
 	for _, s := range n.subnets {
 		if n.driver.vxlanIdm != nil {
-			vni := n.vxlanID(s)
-			vnis = append(vnis, vni)
-			n.driver.vxlanIdm.Release(uint64(vni))
+			vnis = append(vnis, s.vni)
 		}
+		s.vni = 0
+	}
+	n.Unlock()
 
-		n.setVxlanID(s, 0)
+	for _, vni := range vnis {
+		n.driver.vxlanIdm.Release(uint64(vni))
 	}
 
 	return vnis, nil
@@ -1054,7 +1073,7 @@ func (n *network) releaseVxlanID() ([]uint32, error) {
 
 func (n *network) obtainVxlanID(s *subnet) error {
 	//return if the subnet already has a vxlan id assigned
-	if s.vni != 0 {
+	if n.vxlanID(s) != 0 {
 		return nil
 	}
 
@@ -1067,7 +1086,7 @@ func (n *network) obtainVxlanID(s *subnet) error {
 			return fmt.Errorf("getting network %q from datastore failed %v", n.id, err)
 		}
 
-		if s.vni == 0 {
+		if n.vxlanID(s) == 0 {
 			vxlanID, err := n.driver.vxlanIdm.GetID(true)
 			if err != nil {
 				return fmt.Errorf("failed to allocate vxlan id: %v", err)

+ 2 - 17
libnetwork/drivers/overlay/overlay.go

@@ -105,17 +105,6 @@ func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
 		logrus.Warnf("Failure during overlay endpoints restore: %v", err)
 	}
 
-	// If an error happened when the network join the sandbox during the endpoints restore
-	// we should reset it now along with the once variable, so that subsequent endpoint joins
-	// outside of the restore path can potentially fix the network join and succeed.
-	for nid, n := range d.networks {
-		if n.initErr != nil {
-			logrus.Infof("resetting init error and once variable for network %s after unsuccessful endpoint restore: %v", nid, n.initErr)
-			n.initErr = nil
-			n.once = &sync.Once{}
-		}
-	}
-
 	return dc.RegisterDriver(networkType, d, c)
 }
 
@@ -151,14 +140,10 @@ func (d *driver) restoreEndpoints() error {
 			return fmt.Errorf("could not find subnet for endpoint %s", ep.id)
 		}
 
-		if err := n.joinSandbox(true); err != nil {
+		if err := n.joinSandbox(s, true, true); err != nil {
 			return fmt.Errorf("restore network sandbox failed: %v", err)
 		}
 
-		if err := n.joinSubnetSandbox(s, true); err != nil {
-			return fmt.Errorf("restore subnet sandbox failed for %q: %v", s.subnetIP.String(), err)
-		}
-
 		Ifaces := make(map[string][]osl.IfaceOption)
 		vethIfaceOption := make([]osl.IfaceOption, 1)
 		vethIfaceOption = append(vethIfaceOption, n.sbox.InterfaceOptions().Master(s.brName))
@@ -166,10 +151,10 @@ func (d *driver) restoreEndpoints() error {
 
 		err := n.sbox.Restore(Ifaces, nil, nil, nil)
 		if err != nil {
+			n.leaveSandbox()
 			return fmt.Errorf("failed to restore overlay sandbox: %v", err)
 		}
 
-		n.incEndpointCount()
 		d.peerAdd(ep.nid, ep.id, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), false, false, true)
 	}
 	return nil

+ 1 - 1
libnetwork/drivers/overlay/peerdb.go

@@ -384,7 +384,7 @@ func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask
 		return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err)
 	}
 
-	if err := n.joinSubnetSandbox(s, false); err != nil {
+	if err := n.joinSandbox(s, false, false); err != nil {
 		return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err)
 	}
 

+ 10 - 0
libnetwork/osl/interface_linux.go

@@ -289,6 +289,16 @@ func (n *networkNamespace) AddInterface(srcName, dstPrefix string, options ...If
 
 	// Configure the interface now this is moved in the proper namespace.
 	if err := configureInterface(nlh, iface, i); err != nil {
+		// If configuring the device fails move it back to the host namespace
+		// and change the name back to the source name. This allows the caller
+		// to properly cleanup the interface. Its important especially for
+		// interfaces with global attributes, ex: vni id for vxlan interfaces.
+		if nerr := nlh.LinkSetName(iface, i.SrcName()); nerr != nil {
+			logrus.Errorf("renaming interface (%s->%s) failed, %v after config error %v", i.DstName(), i.SrcName(), nerr, err)
+		}
+		if nerr := nlh.LinkSetNsFd(iface, ns.ParseHandlerInt()); nerr != nil {
+			logrus.Errorf("moving inteface %s to host ns failed, %v, after config error %v", i.SrcName(), nerr, err)
+		}
 		return err
 	}