Merge pull request #2146 from ctelfer/fix-overlay-vxlan-races
Fix overlay vxlan races
This commit is contained in:
commit
0f593ae92b
5 changed files with 110 additions and 104 deletions
|
@ -47,18 +47,10 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
|
|||
return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err)
|
||||
}
|
||||
|
||||
if err := n.joinSandbox(false); err != nil {
|
||||
if err := n.joinSandbox(s, false, true); err != nil {
|
||||
return fmt.Errorf("network sandbox join failed: %v", err)
|
||||
}
|
||||
|
||||
if err := n.joinSubnetSandbox(s, false); err != nil {
|
||||
return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err)
|
||||
}
|
||||
|
||||
// joinSubnetSandbox gets called when an endpoint comes up on a new subnet in the
|
||||
// overlay network. Hence the Endpoint count should be updated outside joinSubnetSandbox
|
||||
n.incEndpointCount()
|
||||
|
||||
sbox := n.sandbox()
|
||||
|
||||
overlayIfName, containerIfName, err := createVethPair()
|
||||
|
|
|
@ -39,7 +39,7 @@ var (
|
|||
type networkTable map[string]*network
|
||||
|
||||
type subnet struct {
|
||||
once *sync.Once
|
||||
sboxInit bool
|
||||
vxlanName string
|
||||
brName string
|
||||
vni uint32
|
||||
|
@ -63,7 +63,7 @@ type network struct {
|
|||
endpoints endpointTable
|
||||
driver *driver
|
||||
joinCnt int
|
||||
once *sync.Once
|
||||
sboxInit bool
|
||||
initEpoch int
|
||||
initErr error
|
||||
subnets []*subnet
|
||||
|
@ -150,7 +150,6 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo d
|
|||
id: id,
|
||||
driver: d,
|
||||
endpoints: endpointTable{},
|
||||
once: &sync.Once{},
|
||||
subnets: []*subnet{},
|
||||
}
|
||||
|
||||
|
@ -193,7 +192,6 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo d
|
|||
s := &subnet{
|
||||
subnetIP: ipd.Pool,
|
||||
gwIP: ipd.Gateway,
|
||||
once: &sync.Once{},
|
||||
}
|
||||
|
||||
if len(vnis) != 0 {
|
||||
|
@ -277,7 +275,7 @@ func (d *driver) DeleteNetwork(nid string) error {
|
|||
logrus.Warnf("Failed to delete overlay endpoint %.7s from local store: %v", ep.id, err)
|
||||
}
|
||||
}
|
||||
// flush the peerDB entries
|
||||
|
||||
doPeerFlush = true
|
||||
delete(d.networks, nid)
|
||||
|
||||
|
@ -304,29 +302,54 @@ func (d *driver) RevokeExternalConnectivity(nid, eid string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (n *network) incEndpointCount() {
|
||||
n.Lock()
|
||||
defer n.Unlock()
|
||||
n.joinCnt++
|
||||
}
|
||||
|
||||
func (n *network) joinSandbox(restore bool) error {
|
||||
func (n *network) joinSandbox(s *subnet, restore bool, incJoinCount bool) error {
|
||||
// If there is a race between two go routines here only one will win
|
||||
// the other will wait.
|
||||
n.once.Do(func() {
|
||||
// save the error status of initSandbox in n.initErr so that
|
||||
// all the racing go routines are able to know the status.
|
||||
networkOnce.Do(networkOnceInit)
|
||||
|
||||
n.Lock()
|
||||
// If non-restore initialization occurred and was successful then
|
||||
// tell the peerDB to initialize the sandbox with all the peers
|
||||
// previously received from networkdb. But only do this after
|
||||
// unlocking the network. Otherwise we could deadlock with
|
||||
// on the peerDB channel while peerDB is waiting for the network lock.
|
||||
var doInitPeerDB bool
|
||||
defer func() {
|
||||
n.Unlock()
|
||||
if doInitPeerDB {
|
||||
n.driver.initSandboxPeerDB(n.id)
|
||||
}
|
||||
}()
|
||||
|
||||
if !n.sboxInit {
|
||||
n.initErr = n.initSandbox(restore)
|
||||
})
|
||||
doInitPeerDB = n.initErr == nil && !restore
|
||||
// If there was an error, we cannot recover it
|
||||
n.sboxInit = true
|
||||
}
|
||||
|
||||
return n.initErr
|
||||
}
|
||||
if n.initErr != nil {
|
||||
return fmt.Errorf("network sandbox join failed: %v", n.initErr)
|
||||
}
|
||||
|
||||
func (n *network) joinSubnetSandbox(s *subnet, restore bool) error {
|
||||
s.once.Do(func() {
|
||||
s.initErr = n.initSubnetSandbox(s, restore)
|
||||
})
|
||||
return s.initErr
|
||||
subnetErr := s.initErr
|
||||
if !s.sboxInit {
|
||||
subnetErr = n.initSubnetSandbox(s, restore)
|
||||
// We can recover from these errors, but not on restore
|
||||
if restore || subnetErr == nil {
|
||||
s.initErr = subnetErr
|
||||
s.sboxInit = true
|
||||
}
|
||||
}
|
||||
if subnetErr != nil {
|
||||
return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), subnetErr)
|
||||
}
|
||||
|
||||
if incJoinCount {
|
||||
n.joinCnt++
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (n *network) leaveSandbox() {
|
||||
|
@ -337,15 +360,14 @@ func (n *network) leaveSandbox() {
|
|||
return
|
||||
}
|
||||
|
||||
// We are about to destroy sandbox since the container is leaving the network
|
||||
// Reinitialize the once variable so that we will be able to trigger one time
|
||||
// sandbox initialization(again) when another container joins subsequently.
|
||||
n.once = &sync.Once{}
|
||||
for _, s := range n.subnets {
|
||||
s.once = &sync.Once{}
|
||||
}
|
||||
|
||||
n.destroySandbox()
|
||||
|
||||
n.sboxInit = false
|
||||
n.initErr = nil
|
||||
for _, s := range n.subnets {
|
||||
s.sboxInit = false
|
||||
s.initErr = nil
|
||||
}
|
||||
}
|
||||
|
||||
// to be called while holding network lock
|
||||
|
@ -478,7 +500,7 @@ func (n *network) generateVxlanName(s *subnet) string {
|
|||
id = n.id[:5]
|
||||
}
|
||||
|
||||
return "vx-" + fmt.Sprintf("%06x", n.vxlanID(s)) + "-" + id
|
||||
return fmt.Sprintf("vx-%06x-%v", s.vni, id)
|
||||
}
|
||||
|
||||
func (n *network) generateBridgeName(s *subnet) string {
|
||||
|
@ -491,7 +513,7 @@ func (n *network) generateBridgeName(s *subnet) string {
|
|||
}
|
||||
|
||||
func (n *network) getBridgeNamePrefix(s *subnet) string {
|
||||
return "ov-" + fmt.Sprintf("%06x", n.vxlanID(s))
|
||||
return fmt.Sprintf("ov-%06x", s.vni)
|
||||
}
|
||||
|
||||
func checkOverlap(nw *net.IPNet) error {
|
||||
|
@ -513,7 +535,7 @@ func checkOverlap(nw *net.IPNet) error {
|
|||
}
|
||||
|
||||
func (n *network) restoreSubnetSandbox(s *subnet, brName, vxlanName string) error {
|
||||
sbox := n.sandbox()
|
||||
sbox := n.sbox
|
||||
|
||||
// restore overlay osl sandbox
|
||||
Ifaces := make(map[string][]osl.IfaceOption)
|
||||
|
@ -542,7 +564,7 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error
|
|||
deleteInterfaceBySubnet(n.getBridgeNamePrefix(s), s)
|
||||
}
|
||||
// Try to delete the vxlan interface by vni if already present
|
||||
deleteVxlanByVNI("", n.vxlanID(s))
|
||||
deleteVxlanByVNI("", s.vni)
|
||||
|
||||
if err := checkOverlap(s.subnetIP); err != nil {
|
||||
return err
|
||||
|
@ -556,24 +578,24 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error
|
|||
// it must a stale namespace from previous
|
||||
// life. Destroy it completely and reclaim resourced.
|
||||
networkMu.Lock()
|
||||
path, ok := vniTbl[n.vxlanID(s)]
|
||||
path, ok := vniTbl[s.vni]
|
||||
networkMu.Unlock()
|
||||
|
||||
if ok {
|
||||
deleteVxlanByVNI(path, n.vxlanID(s))
|
||||
deleteVxlanByVNI(path, s.vni)
|
||||
if err := syscall.Unmount(path, syscall.MNT_FORCE); err != nil {
|
||||
logrus.Errorf("unmount of %s failed: %v", path, err)
|
||||
}
|
||||
os.Remove(path)
|
||||
|
||||
networkMu.Lock()
|
||||
delete(vniTbl, n.vxlanID(s))
|
||||
delete(vniTbl, s.vni)
|
||||
networkMu.Unlock()
|
||||
}
|
||||
}
|
||||
|
||||
// create a bridge and vxlan device for this subnet and move it to the sandbox
|
||||
sbox := n.sandbox()
|
||||
sbox := n.sbox
|
||||
|
||||
if err := sbox.AddInterface(brName, "br",
|
||||
sbox.InterfaceOptions().Address(s.gwIP),
|
||||
|
@ -581,13 +603,30 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error
|
|||
return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), err)
|
||||
}
|
||||
|
||||
err := createVxlan(vxlanName, n.vxlanID(s), n.maxMTU())
|
||||
err := createVxlan(vxlanName, s.vni, n.maxMTU())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := sbox.AddInterface(vxlanName, "vxlan",
|
||||
sbox.InterfaceOptions().Master(brName)); err != nil {
|
||||
// If adding vxlan device to the overlay namespace fails, remove the bridge interface we
|
||||
// already added to the namespace. This allows the caller to try the setup again.
|
||||
for _, iface := range sbox.Info().Interfaces() {
|
||||
if iface.SrcName() == brName {
|
||||
if ierr := iface.Remove(); ierr != nil {
|
||||
logrus.Errorf("removing bridge failed from ov ns %v failed, %v", n.sbox.Key(), ierr)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Also, delete the vxlan interface. Since a global vni id is associated
|
||||
// with the vxlan interface, an orphaned vxlan interface will result in
|
||||
// failure of vxlan device creation if the vni is assigned to some other
|
||||
// network.
|
||||
if deleteErr := deleteInterface(vxlanName); deleteErr != nil {
|
||||
logrus.Warnf("could not delete vxlan interface, %s, error %v, after config error, %v", vxlanName, deleteErr, err)
|
||||
}
|
||||
return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err)
|
||||
}
|
||||
|
||||
|
@ -619,6 +658,7 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error
|
|||
return nil
|
||||
}
|
||||
|
||||
// Must be called with the network lock
|
||||
func (n *network) initSubnetSandbox(s *subnet, restore bool) error {
|
||||
brName := n.generateBridgeName(s)
|
||||
vxlanName := n.generateVxlanName(s)
|
||||
|
@ -633,10 +673,8 @@ func (n *network) initSubnetSandbox(s *subnet, restore bool) error {
|
|||
}
|
||||
}
|
||||
|
||||
n.Lock()
|
||||
s.vxlanName = vxlanName
|
||||
s.brName = brName
|
||||
n.Unlock()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
@ -677,11 +715,7 @@ func (n *network) cleanupStaleSandboxes() {
|
|||
}
|
||||
|
||||
func (n *network) initSandbox(restore bool) error {
|
||||
n.Lock()
|
||||
n.initEpoch++
|
||||
n.Unlock()
|
||||
|
||||
networkOnce.Do(networkOnceInit)
|
||||
|
||||
if !restore {
|
||||
if hostMode {
|
||||
|
@ -711,12 +745,7 @@ func (n *network) initSandbox(restore bool) error {
|
|||
}
|
||||
|
||||
// this is needed to let the peerAdd configure the sandbox
|
||||
n.setSandbox(sbox)
|
||||
|
||||
if !restore {
|
||||
// Initialize the sandbox with all the peers previously received from networkdb
|
||||
n.driver.initSandboxPeerDB(n.id)
|
||||
}
|
||||
n.sbox = sbox
|
||||
|
||||
// If we are in swarm mode, we don't need anymore the watchMiss routine.
|
||||
// This will save 1 thread and 1 netlink socket per network
|
||||
|
@ -734,7 +763,7 @@ func (n *network) initSandbox(restore bool) error {
|
|||
tv := syscall.NsecToTimeval(soTimeout.Nanoseconds())
|
||||
err = nlSock.SetReceiveTimeout(&tv)
|
||||
})
|
||||
n.setNetlinkSocket(nlSock)
|
||||
n.nlSocket = nlSock
|
||||
|
||||
if err == nil {
|
||||
go n.watchMiss(nlSock, key)
|
||||
|
@ -836,7 +865,6 @@ func (d *driver) restoreNetworkFromStore(nid string) *network {
|
|||
if n != nil {
|
||||
n.driver = d
|
||||
n.endpoints = endpointTable{}
|
||||
n.once = &sync.Once{}
|
||||
d.networks[nid] = n
|
||||
}
|
||||
return n
|
||||
|
@ -844,11 +872,11 @@ func (d *driver) restoreNetworkFromStore(nid string) *network {
|
|||
|
||||
func (d *driver) network(nid string) *network {
|
||||
d.Lock()
|
||||
defer d.Unlock()
|
||||
n, ok := d.networks[nid]
|
||||
if !ok {
|
||||
n = d.restoreNetworkFromStore(nid)
|
||||
}
|
||||
d.Unlock()
|
||||
|
||||
return n
|
||||
}
|
||||
|
@ -869,26 +897,12 @@ func (d *driver) getNetworkFromStore(nid string) *network {
|
|||
func (n *network) sandbox() osl.Sandbox {
|
||||
n.Lock()
|
||||
defer n.Unlock()
|
||||
|
||||
return n.sbox
|
||||
}
|
||||
|
||||
func (n *network) setSandbox(sbox osl.Sandbox) {
|
||||
n.Lock()
|
||||
n.sbox = sbox
|
||||
n.Unlock()
|
||||
}
|
||||
|
||||
func (n *network) setNetlinkSocket(nlSk *nl.NetlinkSocket) {
|
||||
n.Lock()
|
||||
n.nlSocket = nlSk
|
||||
n.Unlock()
|
||||
}
|
||||
|
||||
func (n *network) vxlanID(s *subnet) uint32 {
|
||||
n.Lock()
|
||||
defer n.Unlock()
|
||||
|
||||
return s.vni
|
||||
}
|
||||
|
||||
|
@ -997,7 +1011,6 @@ func (n *network) SetValue(value []byte) error {
|
|||
subnetIP: subnetIP,
|
||||
gwIP: gwIP,
|
||||
vni: vni,
|
||||
once: &sync.Once{},
|
||||
}
|
||||
n.subnets = append(n.subnets, s)
|
||||
} else {
|
||||
|
@ -1023,7 +1036,10 @@ func (n *network) writeToStore() error {
|
|||
}
|
||||
|
||||
func (n *network) releaseVxlanID() ([]uint32, error) {
|
||||
if len(n.subnets) == 0 {
|
||||
n.Lock()
|
||||
nSubnets := len(n.subnets)
|
||||
n.Unlock()
|
||||
if nSubnets == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
|
@ -1039,14 +1055,17 @@ func (n *network) releaseVxlanID() ([]uint32, error) {
|
|||
}
|
||||
}
|
||||
var vnis []uint32
|
||||
n.Lock()
|
||||
for _, s := range n.subnets {
|
||||
if n.driver.vxlanIdm != nil {
|
||||
vni := n.vxlanID(s)
|
||||
vnis = append(vnis, vni)
|
||||
n.driver.vxlanIdm.Release(uint64(vni))
|
||||
vnis = append(vnis, s.vni)
|
||||
}
|
||||
s.vni = 0
|
||||
}
|
||||
n.Unlock()
|
||||
|
||||
n.setVxlanID(s, 0)
|
||||
for _, vni := range vnis {
|
||||
n.driver.vxlanIdm.Release(uint64(vni))
|
||||
}
|
||||
|
||||
return vnis, nil
|
||||
|
@ -1054,7 +1073,7 @@ func (n *network) releaseVxlanID() ([]uint32, error) {
|
|||
|
||||
func (n *network) obtainVxlanID(s *subnet) error {
|
||||
//return if the subnet already has a vxlan id assigned
|
||||
if s.vni != 0 {
|
||||
if n.vxlanID(s) != 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -1067,7 +1086,7 @@ func (n *network) obtainVxlanID(s *subnet) error {
|
|||
return fmt.Errorf("getting network %q from datastore failed %v", n.id, err)
|
||||
}
|
||||
|
||||
if s.vni == 0 {
|
||||
if n.vxlanID(s) == 0 {
|
||||
vxlanID, err := n.driver.vxlanIdm.GetID(true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to allocate vxlan id: %v", err)
|
||||
|
|
|
@ -105,17 +105,6 @@ func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
|
|||
logrus.Warnf("Failure during overlay endpoints restore: %v", err)
|
||||
}
|
||||
|
||||
// If an error happened when the network join the sandbox during the endpoints restore
|
||||
// we should reset it now along with the once variable, so that subsequent endpoint joins
|
||||
// outside of the restore path can potentially fix the network join and succeed.
|
||||
for nid, n := range d.networks {
|
||||
if n.initErr != nil {
|
||||
logrus.Infof("resetting init error and once variable for network %s after unsuccessful endpoint restore: %v", nid, n.initErr)
|
||||
n.initErr = nil
|
||||
n.once = &sync.Once{}
|
||||
}
|
||||
}
|
||||
|
||||
return dc.RegisterDriver(networkType, d, c)
|
||||
}
|
||||
|
||||
|
@ -151,14 +140,10 @@ func (d *driver) restoreEndpoints() error {
|
|||
return fmt.Errorf("could not find subnet for endpoint %s", ep.id)
|
||||
}
|
||||
|
||||
if err := n.joinSandbox(true); err != nil {
|
||||
if err := n.joinSandbox(s, true, true); err != nil {
|
||||
return fmt.Errorf("restore network sandbox failed: %v", err)
|
||||
}
|
||||
|
||||
if err := n.joinSubnetSandbox(s, true); err != nil {
|
||||
return fmt.Errorf("restore subnet sandbox failed for %q: %v", s.subnetIP.String(), err)
|
||||
}
|
||||
|
||||
Ifaces := make(map[string][]osl.IfaceOption)
|
||||
vethIfaceOption := make([]osl.IfaceOption, 1)
|
||||
vethIfaceOption = append(vethIfaceOption, n.sbox.InterfaceOptions().Master(s.brName))
|
||||
|
@ -166,10 +151,10 @@ func (d *driver) restoreEndpoints() error {
|
|||
|
||||
err := n.sbox.Restore(Ifaces, nil, nil, nil)
|
||||
if err != nil {
|
||||
n.leaveSandbox()
|
||||
return fmt.Errorf("failed to restore overlay sandbox: %v", err)
|
||||
}
|
||||
|
||||
n.incEndpointCount()
|
||||
d.peerAdd(ep.nid, ep.id, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), false, false, true)
|
||||
}
|
||||
return nil
|
||||
|
|
|
@ -384,7 +384,7 @@ func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask
|
|||
return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err)
|
||||
}
|
||||
|
||||
if err := n.joinSubnetSandbox(s, false); err != nil {
|
||||
if err := n.joinSandbox(s, false, false); err != nil {
|
||||
return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err)
|
||||
}
|
||||
|
||||
|
|
|
@ -289,6 +289,16 @@ func (n *networkNamespace) AddInterface(srcName, dstPrefix string, options ...If
|
|||
|
||||
// Configure the interface now this is moved in the proper namespace.
|
||||
if err := configureInterface(nlh, iface, i); err != nil {
|
||||
// If configuring the device fails move it back to the host namespace
|
||||
// and change the name back to the source name. This allows the caller
|
||||
// to properly cleanup the interface. Its important especially for
|
||||
// interfaces with global attributes, ex: vni id for vxlan interfaces.
|
||||
if nerr := nlh.LinkSetName(iface, i.SrcName()); nerr != nil {
|
||||
logrus.Errorf("renaming interface (%s->%s) failed, %v after config error %v", i.DstName(), i.SrcName(), nerr, err)
|
||||
}
|
||||
if nerr := nlh.LinkSetNsFd(iface, ns.ParseHandlerInt()); nerr != nil {
|
||||
logrus.Errorf("moving inteface %s to host ns failed, %v, after config error %v", i.SrcName(), nerr, err)
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue