Browse Source

Merge pull request #2138 from ctelfer/scalable-lb

Improve load balancing scalability in swarm mode
Flavio Crisciani 7 years ago
parent
commit
a5e7cfe3d7

+ 1 - 1
libnetwork/controller.go

@@ -871,7 +871,7 @@ addToStore:
 		}
 	}()
 
-	if len(network.loadBalancerIP) != 0 {
+	if network.hasLoadBalancerEndpoint() {
 		if err = network.createLoadBalancerSandbox(); err != nil {
 			return nil, err
 		}

+ 6 - 4
libnetwork/default_gateway.go

@@ -49,9 +49,11 @@ func (sb *sandbox) setupDefaultGW() error {
 
 	createOptions := []EndpointOption{CreateOptionAnonymous()}
 
-	eplen := gwEPlen
-	if len(sb.containerID) < gwEPlen {
-		eplen = len(sb.containerID)
+	var gwName string
+	if len(sb.containerID) <= gwEPlen {
+		gwName = "gateway_" + sb.containerID
+	} else {
+		gwName = "gateway_" + sb.id[:gwEPlen]
 	}
 
 	sbLabels := sb.Labels()
@@ -69,7 +71,7 @@ func (sb *sandbox) setupDefaultGW() error {
 		createOptions = append(createOptions, epOption)
 	}
 
-	newEp, err := n.CreateEndpoint("gateway_"+sb.containerID[0:eplen], createOptions...)
+	newEp, err := n.CreateEndpoint(gwName, createOptions...)
 	if err != nil {
 		return fmt.Errorf("container %s: endpoint create on GW Network failed: %v", sb.containerID, err)
 	}

+ 6 - 0
libnetwork/endpoint.go

@@ -540,6 +540,12 @@ func (ep *endpoint) sbJoin(sb *sandbox, options ...EndpointOption) (err error) {
 		}
 	}()
 
+	// Load balancing endpoints should never have a default gateway nor
+	// should they alter the status of a network's default gateway
+	if ep.loadBalancer && !sb.ingress {
+		return nil
+	}
+
 	if sb.needDefaultGW() && sb.getEndpointInGWNetwork() == nil {
 		return sb.setupDefaultGW()
 	}

+ 7 - 0
libnetwork/endpoint_info.go

@@ -49,6 +49,9 @@ type InterfaceInfo interface {
 
 	// LinkLocalAddresses returns the list of link-local (IPv4/IPv6) addresses assigned to the endpoint.
 	LinkLocalAddresses() []*net.IPNet
+
+	// SrcName returns the name of the interface w/in the container
+	SrcName() string
 }
 
 type endpointInterface struct {
@@ -272,6 +275,10 @@ func (epi *endpointInterface) LinkLocalAddresses() []*net.IPNet {
 	return epi.llAddrs
 }
 
+func (epi *endpointInterface) SrcName() string {
+	return epi.srcName
+}
+
 func (epi *endpointInterface) SetNames(srcName string, dstPrefix string) error {
 	epi.srcName = srcName
 	epi.dstPrefix = dstPrefix

+ 101 - 28
libnetwork/network.go

@@ -40,7 +40,7 @@ type Network interface {
 	CreateEndpoint(name string, options ...EndpointOption) (Endpoint, error)
 
 	// Delete the network.
-	Delete() error
+	Delete(options ...NetworkDeleteOption) error
 
 	// Endpoints returns the list of Endpoint(s) in this network.
 	Endpoints() []Endpoint
@@ -875,6 +875,28 @@ func (n *network) processOptions(options ...NetworkOption) {
 	}
 }
 
+type networkDeleteParams struct {
+	rmLBEndpoint bool
+}
+
+// NetworkDeleteOption is a type for optional parameters to pass to the
+// network.Delete() function.
+type NetworkDeleteOption func(p *networkDeleteParams)
+
+// NetworkDeleteOptionRemoveLB informs a network.Delete() operation that should
+// remove the load balancer endpoint for this network.  Note that the Delete()
+// method will automatically remove a load balancing endpoint for most networks
+// when the network is otherwise empty.  However, this does not occur for some
+// networks.  In particular, networks marked as ingress (which are supposed to
+// be more permanent than other overlay networks) won't automatically remove
+// the LB endpoint on Delete().  This method allows for explicit removal of
+// such networks provided there are no other endpoints present in the network.
+// If the network still has non-LB endpoints present, Delete() will not
+// remove the LB endpoint and will return an error.
+func NetworkDeleteOptionRemoveLB(p *networkDeleteParams) {
+	p.rmLBEndpoint = true
+}
+
 func (n *network) resolveDriver(name string, load bool) (driverapi.Driver, *driverapi.Capability, error) {
 	c := n.getController()
 
@@ -938,11 +960,23 @@ func (n *network) driver(load bool) (driverapi.Driver, error) {
 	return d, nil
 }
 
-func (n *network) Delete() error {
-	return n.delete(false)
+func (n *network) Delete(options ...NetworkDeleteOption) error {
+	var params networkDeleteParams
+	for _, opt := range options {
+		opt(&params)
+	}
+	return n.delete(false, params.rmLBEndpoint)
 }
 
-func (n *network) delete(force bool) error {
+// This function gets called in 3 ways:
+//  * Delete() -- (false, false)
+//      remove if endpoint count == 0 or endpoint count == 1 and
+//      there is a load balancer IP
+//  * Delete(libnetwork.NetworkDeleteOptionRemoveLB) -- (false, true)
+//      remove load balancer and network if endpoint count == 1
+//  * controller.networkCleanup() -- (true, true)
+//      remove the network no matter what
+func (n *network) delete(force bool, rmLBEndpoint bool) error {
 	n.Lock()
 	c := n.ctrlr
 	name := n.name
@@ -957,10 +991,32 @@ func (n *network) delete(force bool) error {
 		return &UnknownNetworkError{name: name, id: id}
 	}
 
-	if len(n.loadBalancerIP) != 0 {
-		endpoints := n.Endpoints()
-		if force || (len(endpoints) == 1 && !n.ingress) {
-			n.deleteLoadBalancerSandbox()
+	// Only remove ingress on force removal or explicit LB endpoint removal
+	if n.ingress && !force && !rmLBEndpoint {
+		return &ActiveEndpointsError{name: n.name, id: n.id}
+	}
+
+	// Check that the network is empty
+	var emptyCount uint64
+	if n.hasLoadBalancerEndpoint() {
+		emptyCount = 1
+	}
+	if !force && n.getEpCnt().EndpointCnt() > emptyCount {
+		if n.configOnly {
+			return types.ForbiddenErrorf("configuration network %q is in use", n.Name())
+		}
+		return &ActiveEndpointsError{name: n.name, id: n.id}
+	}
+
+	if n.hasLoadBalancerEndpoint() {
+		// If we got to this point, then the following must hold:
+		//  * force is true OR endpoint count == 1
+		if err := n.deleteLoadBalancerSandbox(); err != nil {
+			if !force {
+				return err
+			}
+			// continue deletion when force is true even on error
+			logrus.Warnf("Error deleting load balancer sandbox: %v", err)
 		}
 		//Reload the network from the store to update the epcnt.
 		n, err = c.getNetworkFromStore(id)
@@ -969,12 +1025,10 @@ func (n *network) delete(force bool) error {
 		}
 	}
 
-	if !force && n.getEpCnt().EndpointCnt() != 0 {
-		if n.configOnly {
-			return types.ForbiddenErrorf("configuration network %q is in use", n.Name())
-		}
-		return &ActiveEndpointsError{name: n.name, id: n.id}
-	}
+	// Up to this point, errors that we returned were recoverable.
+	// From here on, any errors leave us in an inconsistent state.
+	// This is unfortunate, but there isn't a safe way to
+	// reconstitute a load-balancer endpoint after removing it.
 
 	// Mark the network for deletion
 	n.inDelete = true
@@ -1023,9 +1077,6 @@ func (n *network) delete(force bool) error {
 	// Cleanup the service discovery for this network
 	c.cleanupServiceDiscovery(n.ID())
 
-	// Cleanup the load balancer
-	c.cleanupServiceBindings(n.ID())
-
 removeFromStore:
 	// deleteFromStore performs an atomic delete operation and the
 	// network.epCnt will help prevent any possible
@@ -1877,6 +1928,10 @@ func (n *network) hasSpecialDriver() bool {
 	return n.Type() == "host" || n.Type() == "null"
 }
 
+func (n *network) hasLoadBalancerEndpoint() bool {
+	return len(n.loadBalancerIP) != 0
+}
+
 func (n *network) ResolveName(req string, ipType int) ([]net.IP, bool) {
 	var ipv6Miss bool
 
@@ -2056,8 +2111,20 @@ func (c *controller) getConfigNetwork(name string) (*network, error) {
 	return n.(*network), nil
 }
 
-func (n *network) createLoadBalancerSandbox() error {
-	sandboxName := n.name + "-sbox"
+func (n *network) lbSandboxName() string {
+	name := "lb-" + n.name
+	if n.ingress {
+		name = n.name + "-sbox"
+	}
+	return name
+}
+
+func (n *network) lbEndpointName() string {
+	return n.name + "-endpoint"
+}
+
+func (n *network) createLoadBalancerSandbox() (retErr error) {
+	sandboxName := n.lbSandboxName()
 	sbOptions := []SandboxOption{}
 	if n.ingress {
 		sbOptions = append(sbOptions, OptionIngress())
@@ -2067,26 +2134,30 @@ func (n *network) createLoadBalancerSandbox() error {
 		return err
 	}
 	defer func() {
-		if err != nil {
+		if retErr != nil {
 			if e := n.ctrlr.SandboxDestroy(sandboxName); e != nil {
-				logrus.Warnf("could not delete sandbox %s on failure on failure (%v): %v", sandboxName, err, e)
+				logrus.Warnf("could not delete sandbox %s on failure on failure (%v): %v", sandboxName, retErr, e)
 			}
 		}
 	}()
 
-	endpointName := n.name + "-endpoint"
+	endpointName := n.lbEndpointName()
 	epOptions := []EndpointOption{
 		CreateOptionIpam(n.loadBalancerIP, nil, nil, nil),
 		CreateOptionLoadBalancer(),
 	}
+	if n.hasLoadBalancerEndpoint() && !n.ingress {
+		// Mark LB endpoints as anonymous so they don't show up in DNS
+		epOptions = append(epOptions, CreateOptionAnonymous())
+	}
 	ep, err := n.createEndpoint(endpointName, epOptions...)
 	if err != nil {
 		return err
 	}
 	defer func() {
-		if err != nil {
+		if retErr != nil {
 			if e := ep.Delete(true); e != nil {
-				logrus.Warnf("could not delete endpoint %s on failure on failure (%v): %v", endpointName, err, e)
+				logrus.Warnf("could not delete endpoint %s on failure on failure (%v): %v", endpointName, retErr, e)
 			}
 		}
 	}()
@@ -2094,17 +2165,18 @@ func (n *network) createLoadBalancerSandbox() error {
 	if err := ep.Join(sb, nil); err != nil {
 		return err
 	}
+
 	return sb.EnableService()
 }
 
-func (n *network) deleteLoadBalancerSandbox() {
+func (n *network) deleteLoadBalancerSandbox() error {
 	n.Lock()
 	c := n.ctrlr
 	name := n.name
 	n.Unlock()
 
-	endpointName := name + "-endpoint"
-	sandboxName := name + "-sbox"
+	sandboxName := n.lbSandboxName()
+	endpointName := n.lbEndpointName()
 
 	endpoint, err := n.EndpointByName(endpointName)
 	if err != nil {
@@ -2129,6 +2201,7 @@ func (n *network) deleteLoadBalancerSandbox() {
 	}
 
 	if err := c.SandboxDestroy(sandboxName); err != nil {
-		logrus.Warnf("Failed to delete %s sandbox: %v", sandboxName, err)
+		return fmt.Errorf("Failed to delete %s sandbox: %v", sandboxName, err)
 	}
+	return nil
 }

+ 8 - 4
libnetwork/osl/namespace_linux.go

@@ -358,16 +358,20 @@ func (n *networkNamespace) loopbackUp() error {
 	return n.nlHandle.LinkSetUp(iface)
 }
 
-func (n *networkNamespace) AddLoopbackAliasIP(ip *net.IPNet) error {
-	iface, err := n.nlHandle.LinkByName("lo")
+func (n *networkNamespace) GetLoopbackIfaceName() string {
+	return "lo"
+}
+
+func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error {
+	iface, err := n.nlHandle.LinkByName(ifName)
 	if err != nil {
 		return err
 	}
 	return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip})
 }
 
-func (n *networkNamespace) RemoveLoopbackAliasIP(ip *net.IPNet) error {
-	iface, err := n.nlHandle.LinkByName("lo")
+func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error {
+	iface, err := n.nlHandle.LinkByName(ifName)
 	if err != nil {
 		return err
 	}

+ 7 - 4
libnetwork/osl/sandbox.go

@@ -32,11 +32,14 @@ type Sandbox interface {
 	// Unset the previously set default IPv6 gateway in the sandbox
 	UnsetGatewayIPv6() error
 
-	// AddLoopbackAliasIP adds the passed IP address to the sandbox loopback interface
-	AddLoopbackAliasIP(ip *net.IPNet) error
+	// GetLoopbackIfaceName returns the name of the loopback interface
+	GetLoopbackIfaceName() string
 
-	// RemoveLoopbackAliasIP removes the passed IP address from the sandbox loopback interface
-	RemoveLoopbackAliasIP(ip *net.IPNet) error
+	// AddAliasIP adds the passed IP address to the named interface
+	AddAliasIP(ifName string, ip *net.IPNet) error
+
+	// RemoveAliasIP removes the passed IP address from the named interface
+	RemoveAliasIP(ifName string, ip *net.IPNet) error
 
 	// Add a static route to the sandbox.
 	AddStaticRoute(*types.StaticRoute) error

+ 1 - 15
libnetwork/sandbox.go

@@ -740,15 +740,8 @@ func releaseOSSboxResources(osSbox osl.Sandbox, ep *endpoint) {
 
 	ep.Lock()
 	joinInfo := ep.joinInfo
-	vip := ep.virtualIP
 	ep.Unlock()
 
-	if len(vip) != 0 {
-		if err := osSbox.RemoveLoopbackAliasIP(&net.IPNet{IP: vip, Mask: net.CIDRMask(32, 32)}); err != nil {
-			logrus.Warnf("Remove virtual IP %v failed: %v", vip, err)
-		}
-	}
-
 	if joinInfo == nil {
 		return
 	}
@@ -861,13 +854,6 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
 		}
 	}
 
-	if len(ep.virtualIP) != 0 {
-		err := sb.osSbox.AddLoopbackAliasIP(&net.IPNet{IP: ep.virtualIP, Mask: net.CIDRMask(32, 32)})
-		if err != nil {
-			return fmt.Errorf("failed to add virtual IP %v: %v", ep.virtualIP, err)
-		}
-	}
-
 	if joinInfo != nil {
 		// Set up non-interface routes.
 		for _, r := range joinInfo.StaticRoutes {
@@ -893,7 +879,7 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
 	// information including gateway and other routes so that
 	// loadbalancers are populated all the network state is in
 	// place in the sandbox.
-	sb.populateLoadbalancers(ep)
+	sb.populateLoadBalancers(ep)
 
 	// Only update the store if we did not come here as part of
 	// sandbox delete. If we came here as part of delete then do

+ 18 - 12
libnetwork/service_common.go

@@ -225,6 +225,13 @@ func makeServiceCleanupFunc(c *controller, s *service, nID, eID string, vip net.
 func (c *controller) addServiceBinding(svcName, svcID, nID, eID, containerName string, vip net.IP, ingressPorts []*PortConfig, serviceAliases, taskAliases []string, ip net.IP, method string) error {
 	var addService bool
 
+	// Failure to lock the network ID on add can result in racing
+	// racing against network deletion resulting in inconsistent
+	// state in the c.serviceBindings map and it's sub-maps. Also,
+	// always lock network ID before services to avoid deadlock.
+	c.networkLocker.Lock(nID)
+	defer c.networkLocker.Unlock(nID)
+
 	n, err := c.NetworkByID(nID)
 	if err != nil {
 		return err
@@ -289,11 +296,8 @@ func (c *controller) addServiceBinding(svcName, svcID, nID, eID, containerName s
 		logrus.Warnf("addServiceBinding %s possible transient state ok:%t entries:%d set:%t %s", eID, ok, entries, b, setStr)
 	}
 
-	// Add loadbalancer service and backend in all sandboxes in
-	// the network only if vip is valid.
-	if len(vip) != 0 {
-		n.(*network).addLBBackend(ip, vip, lb, ingressPorts)
-	}
+	// Add loadbalancer service and backend to the network
+	n.(*network).addLBBackend(ip, lb)
 
 	// Add the appropriate name resolutions
 	c.addEndpointNameResolution(svcName, svcID, nID, eID, containerName, vip, serviceAliases, taskAliases, ip, addService, "addServiceBinding")
@@ -307,11 +311,6 @@ func (c *controller) rmServiceBinding(svcName, svcID, nID, eID, containerName st
 
 	var rmService bool
 
-	n, err := c.NetworkByID(nID)
-	if err != nil {
-		return err
-	}
-
 	skey := serviceKey{
 		id:    svcID,
 		ports: portConfigs(ingressPorts).String(),
@@ -368,8 +367,15 @@ func (c *controller) rmServiceBinding(svcName, svcID, nID, eID, containerName st
 
 	// Remove loadbalancer service(if needed) and backend in all
 	// sandboxes in the network only if the vip is valid.
-	if len(vip) != 0 && entries == 0 {
-		n.(*network).rmLBBackend(ip, vip, lb, ingressPorts, rmService, fullRemove)
+	if entries == 0 {
+		// The network may well have been deleted before the last
+		// of the service bindings.  That's ok, because removing
+		// the network sandbox implicitly removes the backend
+		// service bindings.
+		n, err := c.NetworkByID(nID)
+		if err == nil {
+			n.(*network).rmLBBackend(ip, lb, rmService, fullRemove)
+		}
 	}
 
 	// Delete the name resolutions

+ 110 - 138
libnetwork/service_linux.go

@@ -30,40 +30,9 @@ func init() {
 	reexec.Register("redirecter", redirecter)
 }
 
-// Get all loadbalancers on this network that is currently discovered
-// on this node.
-func (n *network) connectedLoadbalancers() []*loadBalancer {
-	c := n.getController()
-
-	c.Lock()
-	serviceBindings := make([]*service, 0, len(c.serviceBindings))
-	for _, s := range c.serviceBindings {
-		serviceBindings = append(serviceBindings, s)
-	}
-	c.Unlock()
-
-	var lbs []*loadBalancer
-	for _, s := range serviceBindings {
-		s.Lock()
-		// Skip the serviceBindings that got deleted
-		if s.deleted {
-			s.Unlock()
-			continue
-		}
-		if lb, ok := s.loadBalancers[n.ID()]; ok {
-			lbs = append(lbs, lb)
-		}
-		s.Unlock()
-	}
-
-	return lbs
-}
-
 // Populate all loadbalancers on the network that the passed endpoint
 // belongs to, into this sandbox.
-func (sb *sandbox) populateLoadbalancers(ep *endpoint) {
-	var gwIP net.IP
-
+func (sb *sandbox) populateLoadBalancers(ep *endpoint) {
 	// This is an interface less endpoint. Nothing to do.
 	if ep.Iface() == nil {
 		return
@@ -77,96 +46,64 @@ func (sb *sandbox) populateLoadbalancers(ep *endpoint) {
 			logrus.Errorf("Failed to add redirect rules for ep %s (%s): %v", ep.Name(), ep.ID()[0:7], err)
 		}
 	}
+}
 
-	if sb.ingress {
-		// For the ingress sandbox if this is not gateway
-		// endpoint do nothing.
-		if ep != sb.getGatewayEndpoint() {
-			return
-		}
-
-		// This is the gateway endpoint. Now get the ingress
-		// network and plumb the loadbalancers.
-		gwIP = ep.Iface().Address().IP
-		for _, ep := range sb.getConnectedEndpoints() {
-			if !ep.endpointInGWNetwork() {
-				n = ep.getNetwork()
-				eIP = ep.Iface().Address()
-			}
+func (n *network) findLBEndpointSandbox() (*endpoint, *sandbox, error) {
+	// TODO: get endpoint from store?  See EndpointInfo()
+	var ep *endpoint
+	// Find this node's LB sandbox endpoint:  there should be exactly one
+	for _, e := range n.Endpoints() {
+		epi := e.Info()
+		if epi != nil && epi.LoadBalancer() {
+			ep = e.(*endpoint)
+			break
 		}
 	}
-
-	for _, lb := range n.connectedLoadbalancers() {
-		// Skip if vip is not valid.
-		if len(lb.vip) == 0 {
-			continue
-		}
-
-		lb.service.Lock()
-		for _, be := range lb.backEnds {
-			if !be.disabled {
-				sb.addLBBackend(be.ip, lb.vip, lb.fwMark, lb.service.ingressPorts, eIP, gwIP, n.ingress)
-			}
-		}
-		lb.service.Unlock()
+	if ep == nil {
+		return nil, nil, fmt.Errorf("Unable to find load balancing endpoint for network %s", n.ID())
 	}
+	// Get the load balancer sandbox itself as well
+	sb, ok := ep.getSandbox()
+	if !ok {
+		return nil, nil, fmt.Errorf("Unable to get sandbox for %s(%s) in for %s", ep.Name(), ep.ID(), n.ID())
+	}
+	ep = sb.getEndpoint(ep.ID())
+	if ep == nil {
+		return nil, nil, fmt.Errorf("Load balancing endpoint %s(%s) removed from %s", ep.Name(), ep.ID(), n.ID())
+	}
+	return ep, sb, nil
 }
 
-// Add loadbalancer backend to all sandboxes which has a connection to
-// this network. If needed add the service as well.
-func (n *network) addLBBackend(ip, vip net.IP, lb *loadBalancer, ingressPorts []*PortConfig) {
-	n.WalkEndpoints(func(e Endpoint) bool {
-		ep := e.(*endpoint)
-		if sb, ok := ep.getSandbox(); ok {
-			if !sb.isEndpointPopulated(ep) {
-				return false
-			}
-
-			var gwIP net.IP
-			if ep := sb.getGatewayEndpoint(); ep != nil {
-				gwIP = ep.Iface().Address().IP
-			}
-
-			sb.addLBBackend(ip, vip, lb.fwMark, ingressPorts, ep.Iface().Address(), gwIP, n.ingress)
-		}
-
-		return false
-	})
-}
-
-// Remove loadbalancer backend from all sandboxes which has a
-// connection to this network. If needed remove the service entry as
-// well, as specified by the rmService bool.
-func (n *network) rmLBBackend(ip, vip net.IP, lb *loadBalancer, ingressPorts []*PortConfig, rmService bool, fullRemove bool) {
-	n.WalkEndpoints(func(e Endpoint) bool {
-		ep := e.(*endpoint)
-		if sb, ok := ep.getSandbox(); ok {
-			if !sb.isEndpointPopulated(ep) {
-				return false
-			}
-
-			var gwIP net.IP
-			if ep := sb.getGatewayEndpoint(); ep != nil {
-				gwIP = ep.Iface().Address().IP
-			}
-
-			sb.rmLBBackend(ip, vip, lb.fwMark, ingressPorts, ep.Iface().Address(), gwIP, rmService, fullRemove, n.ingress)
+// Searches the OS sandbox for the name of the endpoint interface
+// within the sandbox.   This is required for adding/removing IP
+// aliases to the interface.
+func findIfaceDstName(sb *sandbox, ep *endpoint) string {
+	srcName := ep.Iface().SrcName()
+	for _, i := range sb.osSbox.Info().Interfaces() {
+		if i.SrcName() == srcName {
+			return i.DstName()
 		}
-
-		return false
-	})
+	}
+	return ""
 }
 
-// Add loadbalancer backend into one connected sandbox.
-func (sb *sandbox) addLBBackend(ip, vip net.IP, fwMark uint32, ingressPorts []*PortConfig, eIP *net.IPNet, gwIP net.IP, isIngressNetwork bool) {
-	if sb.osSbox == nil {
+// Add loadbalancer backend to the loadbalncer sandbox for the network.
+// If needed add the service as well.
+func (n *network) addLBBackend(ip net.IP, lb *loadBalancer) {
+	if len(lb.vip) == 0 {
 		return
 	}
-
-	if isIngressNetwork && !sb.ingress {
+	ep, sb, err := n.findLBEndpointSandbox()
+	if err != nil {
+		logrus.Errorf("addLBBackend %s/%s: %v", n.ID(), n.Name(), err)
+		return
+	}
+	if sb.osSbox == nil {
 		return
 	}
 
+	eIP := ep.Iface().Address()
+
 	i, err := ipvs.New(sb.Key())
 	if err != nil {
 		logrus.Errorf("Failed to create an ipvs handle for sbox %s (%s,%s) for lb addition: %v", sb.ID()[0:7], sb.ContainerID()[0:7], sb.Key(), err)
@@ -176,28 +113,43 @@ func (sb *sandbox) addLBBackend(ip, vip net.IP, fwMark uint32, ingressPorts []*P
 
 	s := &ipvs.Service{
 		AddressFamily: nl.FAMILY_V4,
-		FWMark:        fwMark,
+		FWMark:        lb.fwMark,
 		SchedName:     ipvs.RoundRobin,
 	}
 
 	if !i.IsServicePresent(s) {
-		var filteredPorts []*PortConfig
+		// Add IP alias for the VIP to the endpoint
+		ifName := findIfaceDstName(sb, ep)
+		if ifName == "" {
+			logrus.Errorf("Failed find interface name for endpoint %s(%s) to create LB alias", ep.ID(), ep.Name())
+			return
+		}
+		err := sb.osSbox.AddAliasIP(ifName, &net.IPNet{IP: lb.vip, Mask: net.CIDRMask(32, 32)})
+		if err != nil {
+			logrus.Errorf("Failed add IP alias %s to network %s LB endpoint interface %s: %v", lb.vip, n.ID(), ifName, err)
+			return
+		}
+
 		if sb.ingress {
-			filteredPorts = filterPortConfigs(ingressPorts, false)
+			var gwIP net.IP
+			if ep := sb.getGatewayEndpoint(); ep != nil {
+				gwIP = ep.Iface().Address().IP
+			}
+			filteredPorts := filterPortConfigs(lb.service.ingressPorts, false)
 			if err := programIngress(gwIP, filteredPorts, false); err != nil {
 				logrus.Errorf("Failed to add ingress: %v", err)
 				return
 			}
 		}
 
-		logrus.Debugf("Creating service for vip %s fwMark %d ingressPorts %#v in sbox %s (%s)", vip, fwMark, ingressPorts, sb.ID()[0:7], sb.ContainerID()[0:7])
-		if err := invokeFWMarker(sb.Key(), vip, fwMark, ingressPorts, eIP, false); err != nil {
+		logrus.Debugf("Creating service for vip %s fwMark %d ingressPorts %#v in sbox %s (%s)", lb.vip, lb.fwMark, lb.service.ingressPorts, sb.ID()[0:7], sb.ContainerID()[0:7])
+		if err := invokeFWMarker(sb.Key(), lb.vip, lb.fwMark, lb.service.ingressPorts, eIP, false); err != nil {
 			logrus.Errorf("Failed to add firewall mark rule in sbox %s (%s): %v", sb.ID()[0:7], sb.ContainerID()[0:7], err)
 			return
 		}
 
 		if err := i.NewService(s); err != nil && err != syscall.EEXIST {
-			logrus.Errorf("Failed to create a new service for vip %s fwmark %d in sbox %s (%s): %v", vip, fwMark, sb.ID()[0:7], sb.ContainerID()[0:7], err)
+			logrus.Errorf("Failed to create a new service for vip %s fwmark %d in sbox %s (%s): %v", lb.vip, lb.fwMark, sb.ID()[0:7], sb.ContainerID()[0:7], err)
 			return
 		}
 	}
@@ -212,20 +164,29 @@ func (sb *sandbox) addLBBackend(ip, vip net.IP, fwMark uint32, ingressPorts []*P
 	// destination.
 	s.SchedName = ""
 	if err := i.NewDestination(s, d); err != nil && err != syscall.EEXIST {
-		logrus.Errorf("Failed to create real server %s for vip %s fwmark %d in sbox %s (%s): %v", ip, vip, fwMark, sb.ID()[0:7], sb.ContainerID()[0:7], err)
+		logrus.Errorf("Failed to create real server %s for vip %s fwmark %d in sbox %s (%s): %v", ip, lb.vip, lb.fwMark, sb.ID()[0:7], sb.ContainerID()[0:7], err)
 	}
 }
 
-// Remove loadbalancer backend from one connected sandbox.
-func (sb *sandbox) rmLBBackend(ip, vip net.IP, fwMark uint32, ingressPorts []*PortConfig, eIP *net.IPNet, gwIP net.IP, rmService bool, fullRemove bool, isIngressNetwork bool) {
-	if sb.osSbox == nil {
+// Remove loadbalancer backend the load balancing endpoint for this
+// network. If 'rmService' is true, then remove the service entry as well.
+// If 'fullRemove' is true then completely remove the entry, otherwise
+// just deweight it for now.
+func (n *network) rmLBBackend(ip net.IP, lb *loadBalancer, rmService bool, fullRemove bool) {
+	if len(lb.vip) == 0 {
 		return
 	}
-
-	if isIngressNetwork && !sb.ingress {
+	ep, sb, err := n.findLBEndpointSandbox()
+	if err != nil {
+		logrus.Debugf("rmLBBackend for %s/%s: %v -- probably transient state", n.ID(), n.Name(), err)
+		return
+	}
+	if sb.osSbox == nil {
 		return
 	}
 
+	eIP := ep.Iface().Address()
+
 	i, err := ipvs.New(sb.Key())
 	if err != nil {
 		logrus.Errorf("Failed to create an ipvs handle for sbox %s (%s,%s) for lb removal: %v", sb.ID()[0:7], sb.ContainerID()[0:7], sb.Key(), err)
@@ -235,7 +196,7 @@ func (sb *sandbox) rmLBBackend(ip, vip net.IP, fwMark uint32, ingressPorts []*Po
 
 	s := &ipvs.Service{
 		AddressFamily: nl.FAMILY_V4,
-		FWMark:        fwMark,
+		FWMark:        lb.fwMark,
 	}
 
 	d := &ipvs.Destination{
@@ -246,32 +207,46 @@ func (sb *sandbox) rmLBBackend(ip, vip net.IP, fwMark uint32, ingressPorts []*Po
 
 	if fullRemove {
 		if err := i.DelDestination(s, d); err != nil && err != syscall.ENOENT {
-			logrus.Errorf("Failed to delete real server %s for vip %s fwmark %d in sbox %s (%s): %v", ip, vip, fwMark, sb.ID()[0:7], sb.ContainerID()[0:7], err)
+			logrus.Errorf("Failed to delete real server %s for vip %s fwmark %d in sbox %s (%s): %v", ip, lb.vip, lb.fwMark, sb.ID()[0:7], sb.ContainerID()[0:7], err)
 		}
 	} else {
 		d.Weight = 0
 		if err := i.UpdateDestination(s, d); err != nil && err != syscall.ENOENT {
-			logrus.Errorf("Failed to set LB weight of real server %s to 0 for vip %s fwmark %d in sbox %s (%s): %v", ip, vip, fwMark, sb.ID()[0:7], sb.ContainerID()[0:7], err)
+			logrus.Errorf("Failed to set LB weight of real server %s to 0 for vip %s fwmark %d in sbox %s (%s): %v", ip, lb.vip, lb.fwMark, sb.ID()[0:7], sb.ContainerID()[0:7], err)
 		}
 	}
 
 	if rmService {
 		s.SchedName = ipvs.RoundRobin
 		if err := i.DelService(s); err != nil && err != syscall.ENOENT {
-			logrus.Errorf("Failed to delete service for vip %s fwmark %d in sbox %s (%s): %v", vip, fwMark, sb.ID()[0:7], sb.ContainerID()[0:7], err)
+			logrus.Errorf("Failed to delete service for vip %s fwmark %d in sbox %s (%s): %v", lb.vip, lb.fwMark, sb.ID()[0:7], sb.ContainerID()[0:7], err)
 		}
 
-		var filteredPorts []*PortConfig
 		if sb.ingress {
-			filteredPorts = filterPortConfigs(ingressPorts, true)
+			var gwIP net.IP
+			if ep := sb.getGatewayEndpoint(); ep != nil {
+				gwIP = ep.Iface().Address().IP
+			}
+			filteredPorts := filterPortConfigs(lb.service.ingressPorts, true)
 			if err := programIngress(gwIP, filteredPorts, true); err != nil {
 				logrus.Errorf("Failed to delete ingress: %v", err)
 			}
 		}
 
-		if err := invokeFWMarker(sb.Key(), vip, fwMark, ingressPorts, eIP, true); err != nil {
+		if err := invokeFWMarker(sb.Key(), lb.vip, lb.fwMark, lb.service.ingressPorts, eIP, true); err != nil {
 			logrus.Errorf("Failed to delete firewall mark rule in sbox %s (%s): %v", sb.ID()[0:7], sb.ContainerID()[0:7], err)
 		}
+
+		// Remove IP alias from the VIP to the endpoint
+		ifName := findIfaceDstName(sb, ep)
+		if ifName == "" {
+			logrus.Errorf("Failed find interface name for endpoint %s(%s) to create LB alias", ep.ID(), ep.Name())
+			return
+		}
+		err := sb.osSbox.RemoveAliasIP(ifName, &net.IPNet{IP: lb.vip, Mask: net.CIDRMask(32, 32)})
+		if err != nil {
+			logrus.Errorf("Failed add IP alias %s to network %s LB endpoint interface %s: %v", lb.vip, n.ID(), ifName, err)
+		}
 	}
 }
 
@@ -617,7 +592,7 @@ func fwMarker() {
 		ingressPorts, err = readPortsFromFile(os.Args[5])
 		if err != nil {
 			logrus.Errorf("Failed reading ingress ports file: %v", err)
-			os.Exit(6)
+			os.Exit(2)
 		}
 	}
 
@@ -625,7 +600,7 @@ func fwMarker() {
 	fwMark, err := strconv.ParseUint(os.Args[3], 10, 32)
 	if err != nil {
 		logrus.Errorf("bad fwmark value(%s) passed: %v", os.Args[3], err)
-		os.Exit(2)
+		os.Exit(3)
 	}
 	addDelOpt := os.Args[4]
 
@@ -639,20 +614,20 @@ func fwMarker() {
 	ns, err := netns.GetFromPath(os.Args[1])
 	if err != nil {
 		logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err)
-		os.Exit(3)
+		os.Exit(4)
 	}
 	defer ns.Close()
 
 	if err := netns.Set(ns); err != nil {
 		logrus.Errorf("setting into container net ns %v failed, %v", os.Args[1], err)
-		os.Exit(4)
+		os.Exit(5)
 	}
 
 	if addDelOpt == "-A" {
 		eIP, subnet, err := net.ParseCIDR(os.Args[6])
 		if err != nil {
 			logrus.Errorf("Failed to parse endpoint IP %s: %v", os.Args[6], err)
-			os.Exit(9)
+			os.Exit(6)
 		}
 
 		ruleParams := strings.Fields(fmt.Sprintf("-m ipvs --ipvs -d %s -j SNAT --to-source %s", subnet, eIP))
@@ -663,21 +638,18 @@ func fwMarker() {
 			err := ioutil.WriteFile("/proc/sys/net/ipv4/vs/conntrack", []byte{'1', '\n'}, 0644)
 			if err != nil {
 				logrus.Errorf("Failed to write to /proc/sys/net/ipv4/vs/conntrack: %v", err)
-				os.Exit(8)
+				os.Exit(7)
 			}
 		}
 	}
 
-	rule := strings.Fields(fmt.Sprintf("-t mangle %s OUTPUT -d %s/32 -j MARK --set-mark %d", addDelOpt, vip, fwMark))
-	rules = append(rules, rule)
-
-	rule = strings.Fields(fmt.Sprintf("-t nat %s OUTPUT -p icmp --icmp echo-request -d %s -j DNAT --to 127.0.0.1", addDelOpt, vip))
+	rule := strings.Fields(fmt.Sprintf("-t mangle %s INPUT -d %s/32 -j MARK --set-mark %d", addDelOpt, vip, fwMark))
 	rules = append(rules, rule)
 
 	for _, rule := range rules {
 		if err := iptables.RawCombinedOutputNative(rule...); err != nil {
 			logrus.Errorf("setting up rule failed, %v: %v", rule, err)
-			os.Exit(5)
+			os.Exit(8)
 		}
 	}
 }

+ 1 - 1
libnetwork/service_unsupported.go

@@ -18,7 +18,7 @@ func (c *controller) rmServiceBinding(name, sid, nid, eid string, vip net.IP, in
 	return fmt.Errorf("not supported")
 }
 
-func (sb *sandbox) populateLoadbalancers(ep *endpoint) {
+func (sb *sandbox) populateLoadBalancers(ep *endpoint) {
 }
 
 func arrangeIngressFilterRule() {

+ 14 - 4
libnetwork/service_windows.go

@@ -19,7 +19,13 @@ func init() {
 	lbPolicylistMap = make(map[*loadBalancer]*policyLists)
 }
 
-func (n *network) addLBBackend(ip, vip net.IP, lb *loadBalancer, ingressPorts []*PortConfig) {
+func (n *network) addLBBackend(ip net.IP, lb *loadBalancer) {
+	if len(lb.vip) == 0 {
+		return
+	}
+
+	vip := lb.vip
+	ingressPorts := lb.service.ingressPorts
 
 	if system.GetOSVersion().Build > 16236 {
 		lb.Lock()
@@ -117,11 +123,15 @@ func (n *network) addLBBackend(ip, vip net.IP, lb *loadBalancer, ingressPorts []
 	}
 }
 
-func (n *network) rmLBBackend(ip, vip net.IP, lb *loadBalancer, ingressPorts []*PortConfig, rmService bool, fullRemove bool) {
+func (n *network) rmLBBackend(ip net.IP, lb *loadBalancer, rmService bool, fullRemove bool) {
+	if len(lb.vip) == 0 {
+		return
+	}
+
 	if system.GetOSVersion().Build > 16236 {
 		if numEnabledBackends(lb) > 0 {
 			//Reprogram HNS (actually VFP) with the existing backends.
-			n.addLBBackend(ip, vip, lb, ingressPorts)
+			n.addLBBackend(ip, lb)
 		} else {
 			lb.Lock()
 			defer lb.Unlock()
@@ -156,7 +166,7 @@ func numEnabledBackends(lb *loadBalancer) int {
 	return nEnabled
 }
 
-func (sb *sandbox) populateLoadbalancers(ep *endpoint) {
+func (sb *sandbox) populateLoadBalancers(ep *endpoint) {
 }
 
 func arrangeIngressFilterRule() {

+ 1 - 1
libnetwork/store.go

@@ -479,7 +479,7 @@ func (c *controller) networkCleanup() {
 	for _, n := range networks {
 		if n.inDelete {
 			logrus.Infof("Removing stale network %s (%s)", n.Name(), n.ID())
-			if err := n.delete(true); err != nil {
+			if err := n.delete(true, true); err != nil {
 				logrus.Debugf("Error while removing stale network: %v", err)
 			}
 		}