Merge pull request #2146 from ctelfer/fix-overlay-vxlan-races

Fix overlay vxlan races
This commit is contained in:
Flavio Crisciani 2018-07-11 10:41:46 -07:00 committed by GitHub
commit 0f593ae92b
5 changed files with 110 additions and 104 deletions

View file

@ -47,18 +47,10 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err)
}
if err := n.joinSandbox(false); err != nil {
if err := n.joinSandbox(s, false, true); err != nil {
return fmt.Errorf("network sandbox join failed: %v", err)
}
if err := n.joinSubnetSandbox(s, false); err != nil {
return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err)
}
// joinSubnetSandbox gets called when an endpoint comes up on a new subnet in the
// overlay network. Hence the Endpoint count should be updated outside joinSubnetSandbox
n.incEndpointCount()
sbox := n.sandbox()
overlayIfName, containerIfName, err := createVethPair()

View file

@ -39,7 +39,7 @@ var (
type networkTable map[string]*network
type subnet struct {
once *sync.Once
sboxInit bool
vxlanName string
brName string
vni uint32
@ -63,7 +63,7 @@ type network struct {
endpoints endpointTable
driver *driver
joinCnt int
once *sync.Once
sboxInit bool
initEpoch int
initErr error
subnets []*subnet
@ -150,7 +150,6 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo d
id: id,
driver: d,
endpoints: endpointTable{},
once: &sync.Once{},
subnets: []*subnet{},
}
@ -193,7 +192,6 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo d
s := &subnet{
subnetIP: ipd.Pool,
gwIP: ipd.Gateway,
once: &sync.Once{},
}
if len(vnis) != 0 {
@ -277,7 +275,7 @@ func (d *driver) DeleteNetwork(nid string) error {
logrus.Warnf("Failed to delete overlay endpoint %.7s from local store: %v", ep.id, err)
}
}
// flush the peerDB entries
doPeerFlush = true
delete(d.networks, nid)
@ -304,29 +302,54 @@ func (d *driver) RevokeExternalConnectivity(nid, eid string) error {
return nil
}
func (n *network) incEndpointCount() {
n.Lock()
defer n.Unlock()
n.joinCnt++
}
func (n *network) joinSandbox(restore bool) error {
func (n *network) joinSandbox(s *subnet, restore bool, incJoinCount bool) error {
// If there is a race between two go routines here only one will win
// the other will wait.
n.once.Do(func() {
// save the error status of initSandbox in n.initErr so that
// all the racing go routines are able to know the status.
networkOnce.Do(networkOnceInit)
n.Lock()
// If non-restore initialization occurred and was successful then
// tell the peerDB to initialize the sandbox with all the peers
// previously received from networkdb. But only do this after
// unlocking the network. Otherwise we could deadlock with
// on the peerDB channel while peerDB is waiting for the network lock.
var doInitPeerDB bool
defer func() {
n.Unlock()
if doInitPeerDB {
n.driver.initSandboxPeerDB(n.id)
}
}()
if !n.sboxInit {
n.initErr = n.initSandbox(restore)
})
doInitPeerDB = n.initErr == nil && !restore
// If there was an error, we cannot recover it
n.sboxInit = true
}
return n.initErr
}
if n.initErr != nil {
return fmt.Errorf("network sandbox join failed: %v", n.initErr)
}
func (n *network) joinSubnetSandbox(s *subnet, restore bool) error {
s.once.Do(func() {
s.initErr = n.initSubnetSandbox(s, restore)
})
return s.initErr
subnetErr := s.initErr
if !s.sboxInit {
subnetErr = n.initSubnetSandbox(s, restore)
// We can recover from these errors, but not on restore
if restore || subnetErr == nil {
s.initErr = subnetErr
s.sboxInit = true
}
}
if subnetErr != nil {
return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), subnetErr)
}
if incJoinCount {
n.joinCnt++
}
return nil
}
func (n *network) leaveSandbox() {
@ -337,15 +360,14 @@ func (n *network) leaveSandbox() {
return
}
// We are about to destroy sandbox since the container is leaving the network
// Reinitialize the once variable so that we will be able to trigger one time
// sandbox initialization(again) when another container joins subsequently.
n.once = &sync.Once{}
for _, s := range n.subnets {
s.once = &sync.Once{}
}
n.destroySandbox()
n.sboxInit = false
n.initErr = nil
for _, s := range n.subnets {
s.sboxInit = false
s.initErr = nil
}
}
// to be called while holding network lock
@ -478,7 +500,7 @@ func (n *network) generateVxlanName(s *subnet) string {
id = n.id[:5]
}
return "vx-" + fmt.Sprintf("%06x", n.vxlanID(s)) + "-" + id
return fmt.Sprintf("vx-%06x-%v", s.vni, id)
}
func (n *network) generateBridgeName(s *subnet) string {
@ -491,7 +513,7 @@ func (n *network) generateBridgeName(s *subnet) string {
}
func (n *network) getBridgeNamePrefix(s *subnet) string {
return "ov-" + fmt.Sprintf("%06x", n.vxlanID(s))
return fmt.Sprintf("ov-%06x", s.vni)
}
func checkOverlap(nw *net.IPNet) error {
@ -513,7 +535,7 @@ func checkOverlap(nw *net.IPNet) error {
}
func (n *network) restoreSubnetSandbox(s *subnet, brName, vxlanName string) error {
sbox := n.sandbox()
sbox := n.sbox
// restore overlay osl sandbox
Ifaces := make(map[string][]osl.IfaceOption)
@ -542,7 +564,7 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error
deleteInterfaceBySubnet(n.getBridgeNamePrefix(s), s)
}
// Try to delete the vxlan interface by vni if already present
deleteVxlanByVNI("", n.vxlanID(s))
deleteVxlanByVNI("", s.vni)
if err := checkOverlap(s.subnetIP); err != nil {
return err
@ -556,24 +578,24 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error
// it must a stale namespace from previous
// life. Destroy it completely and reclaim resourced.
networkMu.Lock()
path, ok := vniTbl[n.vxlanID(s)]
path, ok := vniTbl[s.vni]
networkMu.Unlock()
if ok {
deleteVxlanByVNI(path, n.vxlanID(s))
deleteVxlanByVNI(path, s.vni)
if err := syscall.Unmount(path, syscall.MNT_FORCE); err != nil {
logrus.Errorf("unmount of %s failed: %v", path, err)
}
os.Remove(path)
networkMu.Lock()
delete(vniTbl, n.vxlanID(s))
delete(vniTbl, s.vni)
networkMu.Unlock()
}
}
// create a bridge and vxlan device for this subnet and move it to the sandbox
sbox := n.sandbox()
sbox := n.sbox
if err := sbox.AddInterface(brName, "br",
sbox.InterfaceOptions().Address(s.gwIP),
@ -581,13 +603,30 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error
return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), err)
}
err := createVxlan(vxlanName, n.vxlanID(s), n.maxMTU())
err := createVxlan(vxlanName, s.vni, n.maxMTU())
if err != nil {
return err
}
if err := sbox.AddInterface(vxlanName, "vxlan",
sbox.InterfaceOptions().Master(brName)); err != nil {
// If adding vxlan device to the overlay namespace fails, remove the bridge interface we
// already added to the namespace. This allows the caller to try the setup again.
for _, iface := range sbox.Info().Interfaces() {
if iface.SrcName() == brName {
if ierr := iface.Remove(); ierr != nil {
logrus.Errorf("removing bridge failed from ov ns %v failed, %v", n.sbox.Key(), ierr)
}
}
}
// Also, delete the vxlan interface. Since a global vni id is associated
// with the vxlan interface, an orphaned vxlan interface will result in
// failure of vxlan device creation if the vni is assigned to some other
// network.
if deleteErr := deleteInterface(vxlanName); deleteErr != nil {
logrus.Warnf("could not delete vxlan interface, %s, error %v, after config error, %v", vxlanName, deleteErr, err)
}
return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err)
}
@ -619,6 +658,7 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error
return nil
}
// Must be called with the network lock
func (n *network) initSubnetSandbox(s *subnet, restore bool) error {
brName := n.generateBridgeName(s)
vxlanName := n.generateVxlanName(s)
@ -633,10 +673,8 @@ func (n *network) initSubnetSandbox(s *subnet, restore bool) error {
}
}
n.Lock()
s.vxlanName = vxlanName
s.brName = brName
n.Unlock()
return nil
}
@ -677,11 +715,7 @@ func (n *network) cleanupStaleSandboxes() {
}
func (n *network) initSandbox(restore bool) error {
n.Lock()
n.initEpoch++
n.Unlock()
networkOnce.Do(networkOnceInit)
if !restore {
if hostMode {
@ -711,12 +745,7 @@ func (n *network) initSandbox(restore bool) error {
}
// this is needed to let the peerAdd configure the sandbox
n.setSandbox(sbox)
if !restore {
// Initialize the sandbox with all the peers previously received from networkdb
n.driver.initSandboxPeerDB(n.id)
}
n.sbox = sbox
// If we are in swarm mode, we don't need anymore the watchMiss routine.
// This will save 1 thread and 1 netlink socket per network
@ -734,7 +763,7 @@ func (n *network) initSandbox(restore bool) error {
tv := syscall.NsecToTimeval(soTimeout.Nanoseconds())
err = nlSock.SetReceiveTimeout(&tv)
})
n.setNetlinkSocket(nlSock)
n.nlSocket = nlSock
if err == nil {
go n.watchMiss(nlSock, key)
@ -836,7 +865,6 @@ func (d *driver) restoreNetworkFromStore(nid string) *network {
if n != nil {
n.driver = d
n.endpoints = endpointTable{}
n.once = &sync.Once{}
d.networks[nid] = n
}
return n
@ -844,11 +872,11 @@ func (d *driver) restoreNetworkFromStore(nid string) *network {
func (d *driver) network(nid string) *network {
d.Lock()
defer d.Unlock()
n, ok := d.networks[nid]
if !ok {
n = d.restoreNetworkFromStore(nid)
}
d.Unlock()
return n
}
@ -869,26 +897,12 @@ func (d *driver) getNetworkFromStore(nid string) *network {
func (n *network) sandbox() osl.Sandbox {
n.Lock()
defer n.Unlock()
return n.sbox
}
func (n *network) setSandbox(sbox osl.Sandbox) {
n.Lock()
n.sbox = sbox
n.Unlock()
}
func (n *network) setNetlinkSocket(nlSk *nl.NetlinkSocket) {
n.Lock()
n.nlSocket = nlSk
n.Unlock()
}
func (n *network) vxlanID(s *subnet) uint32 {
n.Lock()
defer n.Unlock()
return s.vni
}
@ -997,7 +1011,6 @@ func (n *network) SetValue(value []byte) error {
subnetIP: subnetIP,
gwIP: gwIP,
vni: vni,
once: &sync.Once{},
}
n.subnets = append(n.subnets, s)
} else {
@ -1023,7 +1036,10 @@ func (n *network) writeToStore() error {
}
func (n *network) releaseVxlanID() ([]uint32, error) {
if len(n.subnets) == 0 {
n.Lock()
nSubnets := len(n.subnets)
n.Unlock()
if nSubnets == 0 {
return nil, nil
}
@ -1039,14 +1055,17 @@ func (n *network) releaseVxlanID() ([]uint32, error) {
}
}
var vnis []uint32
n.Lock()
for _, s := range n.subnets {
if n.driver.vxlanIdm != nil {
vni := n.vxlanID(s)
vnis = append(vnis, vni)
n.driver.vxlanIdm.Release(uint64(vni))
vnis = append(vnis, s.vni)
}
s.vni = 0
}
n.Unlock()
n.setVxlanID(s, 0)
for _, vni := range vnis {
n.driver.vxlanIdm.Release(uint64(vni))
}
return vnis, nil
@ -1054,7 +1073,7 @@ func (n *network) releaseVxlanID() ([]uint32, error) {
func (n *network) obtainVxlanID(s *subnet) error {
//return if the subnet already has a vxlan id assigned
if s.vni != 0 {
if n.vxlanID(s) != 0 {
return nil
}
@ -1067,7 +1086,7 @@ func (n *network) obtainVxlanID(s *subnet) error {
return fmt.Errorf("getting network %q from datastore failed %v", n.id, err)
}
if s.vni == 0 {
if n.vxlanID(s) == 0 {
vxlanID, err := n.driver.vxlanIdm.GetID(true)
if err != nil {
return fmt.Errorf("failed to allocate vxlan id: %v", err)

View file

@ -105,17 +105,6 @@ func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
logrus.Warnf("Failure during overlay endpoints restore: %v", err)
}
// If an error happened when the network join the sandbox during the endpoints restore
// we should reset it now along with the once variable, so that subsequent endpoint joins
// outside of the restore path can potentially fix the network join and succeed.
for nid, n := range d.networks {
if n.initErr != nil {
logrus.Infof("resetting init error and once variable for network %s after unsuccessful endpoint restore: %v", nid, n.initErr)
n.initErr = nil
n.once = &sync.Once{}
}
}
return dc.RegisterDriver(networkType, d, c)
}
@ -151,14 +140,10 @@ func (d *driver) restoreEndpoints() error {
return fmt.Errorf("could not find subnet for endpoint %s", ep.id)
}
if err := n.joinSandbox(true); err != nil {
if err := n.joinSandbox(s, true, true); err != nil {
return fmt.Errorf("restore network sandbox failed: %v", err)
}
if err := n.joinSubnetSandbox(s, true); err != nil {
return fmt.Errorf("restore subnet sandbox failed for %q: %v", s.subnetIP.String(), err)
}
Ifaces := make(map[string][]osl.IfaceOption)
vethIfaceOption := make([]osl.IfaceOption, 1)
vethIfaceOption = append(vethIfaceOption, n.sbox.InterfaceOptions().Master(s.brName))
@ -166,10 +151,10 @@ func (d *driver) restoreEndpoints() error {
err := n.sbox.Restore(Ifaces, nil, nil, nil)
if err != nil {
n.leaveSandbox()
return fmt.Errorf("failed to restore overlay sandbox: %v", err)
}
n.incEndpointCount()
d.peerAdd(ep.nid, ep.id, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), false, false, true)
}
return nil

View file

@ -384,7 +384,7 @@ func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask
return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err)
}
if err := n.joinSubnetSandbox(s, false); err != nil {
if err := n.joinSandbox(s, false, false); err != nil {
return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err)
}

View file

@ -289,6 +289,16 @@ func (n *networkNamespace) AddInterface(srcName, dstPrefix string, options ...If
// Configure the interface now this is moved in the proper namespace.
if err := configureInterface(nlh, iface, i); err != nil {
// If configuring the device fails move it back to the host namespace
// and change the name back to the source name. This allows the caller
// to properly cleanup the interface. Its important especially for
// interfaces with global attributes, ex: vni id for vxlan interfaces.
if nerr := nlh.LinkSetName(iface, i.SrcName()); nerr != nil {
logrus.Errorf("renaming interface (%s->%s) failed, %v after config error %v", i.DstName(), i.SrcName(), nerr, err)
}
if nerr := nlh.LinkSetNsFd(iface, ns.ParseHandlerInt()); nerr != nil {
logrus.Errorf("moving inteface %s to host ns failed, %v, after config error %v", i.SrcName(), nerr, err)
}
return err
}