
Vendoring libnetwork and swarmkit

Signed-off-by: Abhinandan Prativadi <abhi@docker.com>
commit 2083121b09
30 changed files with 450 additions and 290 deletions
  1. integration-cli/docker_cli_swarm_test.go (+2 -2)
  2. vendor.conf (+2 -2)
  3. vendor/github.com/docker/libnetwork/agent.go (+0 -6)
  4. vendor/github.com/docker/libnetwork/bitseq/sequence.go (+39 -10)
  5. vendor/github.com/docker/libnetwork/bitseq/store.go (+1 -0)
  6. vendor/github.com/docker/libnetwork/drivers/overlay/encryption.go (+0 -1)
  7. vendor/github.com/docker/libnetwork/drivers/overlay/joinleave.go (+8 -10)
  8. vendor/github.com/docker/libnetwork/drivers/overlay/ov_network.go (+19 -51)
  9. vendor/github.com/docker/libnetwork/drivers/overlay/ov_serf.go (+4 -4)
  10. vendor/github.com/docker/libnetwork/drivers/overlay/overlay.go (+1 -1)
  11. vendor/github.com/docker/libnetwork/drivers/overlay/overlay.proto (+1 -1)
  12. vendor/github.com/docker/libnetwork/drivers/overlay/ovmanager/ovmanager.go (+1 -1)
  13. vendor/github.com/docker/libnetwork/drivers/overlay/peerdb.go (+158 -78)
  14. vendor/github.com/docker/libnetwork/drivers/solaris/overlay/ov_network.go (+1 -1)
  15. vendor/github.com/docker/libnetwork/drivers/windows/windows.go (+28 -8)
  16. vendor/github.com/docker/libnetwork/idm/idm.go (+4 -4)
  17. vendor/github.com/docker/libnetwork/ipam/allocator.go (+12 -4)
  18. vendor/github.com/docker/libnetwork/ipamapi/labels.go (+10 -0)
  19. vendor/github.com/docker/libnetwork/iptables/iptables.go (+1 -1)
  20. vendor/github.com/docker/libnetwork/networkdb/broadcast.go (+6 -6)
  21. vendor/github.com/docker/libnetwork/networkdb/cluster.go (+14 -12)
  22. vendor/github.com/docker/libnetwork/networkdb/delegate.go (+8 -32)
  23. vendor/github.com/docker/libnetwork/networkdb/networkdb.go (+33 -22)
  24. vendor/github.com/docker/libnetwork/osl/namespace_windows.go (+1 -6)
  25. vendor/github.com/docker/libnetwork/osl/neigh_linux.go (+26 -11)
  26. vendor/github.com/docker/swarmkit/manager/allocator/cnmallocator/networkallocator.go (+33 -3)
  27. vendor/github.com/docker/swarmkit/manager/allocator/cnmallocator/portallocator.go (+1 -1)
  28. vendor/github.com/docker/swarmkit/manager/state/raft/raft.go (+34 -9)
  29. vendor/github.com/docker/swarmkit/manager/state/store/memory.go (+1 -2)
  30. vendor/github.com/docker/swarmkit/vendor.conf (+1 -1)

+ 2 - 2
integration-cli/docker_cli_swarm_test.go

@@ -1546,7 +1546,7 @@ func (s *DockerSwarmSuite) TestSwarmNetworkIPAMOptions(c *check.C) {
 	out, err = d.Cmd("network", "inspect", "--format", "{{.IPAM.Options}}", "foo")
 	c.Assert(err, checker.IsNil, check.Commentf(out))
 	c.Assert(strings.TrimSpace(out), checker.Contains, "foo:bar")
-        c.Assert(strings.TrimSpace(out), checker.Contains, "com.docker.network.ipam.serial:true")
+	c.Assert(strings.TrimSpace(out), checker.Contains, "com.docker.network.ipam.serial:true")
 
 	out, err = d.Cmd("service", "create", "--detach", "--no-resolve-image", "--network=foo", "--name", "top", "busybox", "top")
 	c.Assert(err, checker.IsNil, check.Commentf(out))
@@ -1557,7 +1557,7 @@ func (s *DockerSwarmSuite) TestSwarmNetworkIPAMOptions(c *check.C) {
 	out, err = d.Cmd("network", "inspect", "--format", "{{.IPAM.Options}}", "foo")
 	c.Assert(err, checker.IsNil, check.Commentf(out))
 	c.Assert(strings.TrimSpace(out), checker.Contains, "foo:bar")
-        c.Assert(strings.TrimSpace(out), checker.Contains, "com.docker.network.ipam.serial:true")
+	c.Assert(strings.TrimSpace(out), checker.Contains, "com.docker.network.ipam.serial:true")
 }
 
 func (s *DockerTrustedSwarmSuite) TestTrustedServiceCreate(c *check.C) {

+ 2 - 2
vendor.conf

@@ -31,7 +31,7 @@ github.com/moby/buildkit aaff9d591ef128560018433fe61beb802e149de8
 github.com/tonistiigi/fsutil dea3a0da73aee887fc02142d995be764106ac5e2
 
 #get libnetwork packages
-github.com/docker/libnetwork 0f08d31bf0e640e0cdc6d5161227f87602d605c5
+github.com/docker/libnetwork 68f1039f172434709a4550fe92e3e058406c74ce
 github.com/docker/go-events 9461782956ad83b30282bf90e31fa6a70c255ba9
 github.com/armon/go-radix e39d623f12e8e41c7b5529e9a9dd67a1e2261f80
 github.com/armon/go-metrics eb0af217e5e9747e41dd5303755356b62d28e3ec
@@ -109,7 +109,7 @@ github.com/containerd/containerd 06b9cb35161009dcb7123345749fef02f7cea8e0
 github.com/tonistiigi/fifo 1405643975692217d6720f8b54aeee1bf2cd5cf4
 
 # cluster
-github.com/docker/swarmkit 941a01844b89c56aa61086fecb167ab3af1de22b
+github.com/docker/swarmkit 872861d2ae46958af7ead1d5fffb092c73afbaf0
 github.com/gogo/protobuf v0.4
 github.com/cloudflare/cfssl 7fb22c8cba7ecaf98e4082d22d65800cf45e042a
 github.com/google/certificate-transparency d90e65c3a07988180c5b1ece71791c0b6506826e

+ 0 - 6
vendor/github.com/docker/libnetwork/agent.go

@@ -6,11 +6,9 @@ import (
 	"encoding/json"
 	"fmt"
 	"net"
-	"os"
 	"sort"
 	"sync"
 
-	"github.com/docker/docker/pkg/stringid"
 	"github.com/docker/go-events"
 	"github.com/docker/libnetwork/cluster"
 	"github.com/docker/libnetwork/datastore"
@@ -282,12 +280,8 @@ func (c *controller) agentInit(listenAddr, bindAddrOrInterface, advertiseAddr, d
 	}
 
 	keys, _ := c.getKeys(subsysGossip)
-	hostname, _ := os.Hostname()
-	nodeName := hostname + "-" + stringid.TruncateID(stringid.GenerateRandomID())
-	logrus.Info("Gossip cluster hostname ", nodeName)
 
 	netDBConf := networkdb.DefaultConfig()
-	netDBConf.NodeName = nodeName
 	netDBConf.BindAddr = listenAddr
 	netDBConf.AdvertiseAddr = advertiseAddr
 	netDBConf.Keys = keys

+ 39 - 10
vendor/github.com/docker/libnetwork/bitseq/sequence.go

@@ -41,6 +41,7 @@ type Handle struct {
 	id         string
 	dbIndex    uint64
 	dbExists   bool
+	curr       uint64
 	store      datastore.DataStore
 	sync.Mutex
 }
@@ -193,26 +194,27 @@ func (h *Handle) getCopy() *Handle {
 		dbIndex:    h.dbIndex,
 		dbExists:   h.dbExists,
 		store:      h.store,
+		curr:       h.curr,
 	}
 }
 
 // SetAnyInRange atomically sets the first unset bit in the specified range in the sequence and returns the corresponding ordinal
-func (h *Handle) SetAnyInRange(start, end uint64) (uint64, error) {
+func (h *Handle) SetAnyInRange(start, end uint64, serial bool) (uint64, error) {
 	if end < start || end >= h.bits {
 		return invalidPos, fmt.Errorf("invalid bit range [%d, %d]", start, end)
 	}
 	if h.Unselected() == 0 {
 		return invalidPos, ErrNoBitAvailable
 	}
-	return h.set(0, start, end, true, false)
+	return h.set(0, start, end, true, false, serial)
 }
 
 // SetAny atomically sets the first unset bit in the sequence and returns the corresponding ordinal
-func (h *Handle) SetAny() (uint64, error) {
+func (h *Handle) SetAny(serial bool) (uint64, error) {
 	if h.Unselected() == 0 {
 		return invalidPos, ErrNoBitAvailable
 	}
-	return h.set(0, 0, h.bits-1, true, false)
+	return h.set(0, 0, h.bits-1, true, false, serial)
 }
 
 // Set atomically sets the corresponding bit in the sequence
@@ -220,7 +222,7 @@ func (h *Handle) Set(ordinal uint64) error {
 	if err := h.validateOrdinal(ordinal); err != nil {
 		return err
 	}
-	_, err := h.set(ordinal, 0, 0, false, false)
+	_, err := h.set(ordinal, 0, 0, false, false, false)
 	return err
 }
 
@@ -229,7 +231,7 @@ func (h *Handle) Unset(ordinal uint64) error {
 	if err := h.validateOrdinal(ordinal); err != nil {
 		return err
 	}
-	_, err := h.set(ordinal, 0, 0, false, true)
+	_, err := h.set(ordinal, 0, 0, false, true, false)
 	return err
 }
 
@@ -298,7 +300,7 @@ func (h *Handle) CheckConsistency() error {
 }
 
 // set/reset the bit
-func (h *Handle) set(ordinal, start, end uint64, any bool, release bool) (uint64, error) {
+func (h *Handle) set(ordinal, start, end uint64, any bool, release bool, serial bool) (uint64, error) {
 	var (
 		bitPos  uint64
 		bytePos uint64
@@ -308,6 +310,7 @@ func (h *Handle) set(ordinal, start, end uint64, any bool, release bool) (uint64
 
 	for {
 		var store datastore.DataStore
+		curr := uint64(0)
 		h.Lock()
 		store = h.store
 		h.Unlock()
@@ -318,15 +321,18 @@ func (h *Handle) set(ordinal, start, end uint64, any bool, release bool) (uint64
 		}
 
 		h.Lock()
+		if serial {
+			curr = h.curr
+		}
 		// Get position if available
 		if release {
 			bytePos, bitPos = ordinalToPos(ordinal)
 		} else {
 			if any {
-				bytePos, bitPos, err = getFirstAvailable(h.head, start)
+				bytePos, bitPos, err = getAvailableFromCurrent(h.head, start, curr, end)
 				ret = posToOrdinal(bytePos, bitPos)
-				if end < ret {
-					err = ErrNoBitAvailable
+				if err == nil {
+					h.curr = ret + 1
 				}
 			} else {
 				bytePos, bitPos, err = checkIfAvailable(h.head, ordinal)
@@ -515,6 +521,29 @@ func getFirstAvailable(head *sequence, start uint64) (uint64, uint64, error) {
 	return invalidPos, invalidPos, ErrNoBitAvailable
 }
 
+// getAvailableFromCurrent looks for an available ordinal starting from the current ordinal.
+// If none is found, it loops back to the start to check for an available bit.
+// This can be further optimized to check from start to curr in case of a rollover.
+func getAvailableFromCurrent(head *sequence, start, curr, end uint64) (uint64, uint64, error) {
+	var bytePos, bitPos uint64
+	if curr != 0 && curr > start {
+		bytePos, bitPos, _ = getFirstAvailable(head, curr)
+		ret := posToOrdinal(bytePos, bitPos)
+		if end < ret {
+			goto begin
+		}
+		return bytePos, bitPos, nil
+	}
+
+begin:
+	bytePos, bitPos, _ = getFirstAvailable(head, start)
+	ret := posToOrdinal(bytePos, bitPos)
+	if end < ret {
+		return invalidPos, invalidPos, ErrNoBitAvailable
+	}
+	return bytePos, bitPos, nil
+}
+
 // checkIfAvailable checks if the bit correspondent to the specified ordinal is unset
 // If the ordinal is beyond the sequence limits, a negative response is returned
 func checkIfAvailable(head *sequence, ordinal uint64) (uint64, uint64, error) {
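
Note on the serial mode added above: the handle now remembers the last allocated
ordinal in curr and resumes the search from there, wrapping back to start once the
tail of the range is exhausted. A minimal sketch of that wrap-around policy, where
free is a stand-in for getFirstAvailable and all names are illustrative rather than
part of the package:

	import "errors"

	// serialSearch returns the first free ordinal in [start, end], preferring
	// ordinals at or after curr so that consecutive allocations move forward
	// instead of immediately reusing freed low ordinals.
	// free(from) returns the first free ordinal >= from, or an error if none exists.
	func serialSearch(start, curr, end uint64, free func(uint64) (uint64, error)) (uint64, error) {
		if curr > start {
			if ret, err := free(curr); err == nil && ret <= end {
				return ret, nil
			}
			// Tail of the range exhausted: fall through and wrap around to start.
		}
		ret, err := free(start)
		if err != nil || ret > end {
			return 0, errors.New("no bit available")
		}
		return ret, nil
	}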

+ 1 - 0
vendor/github.com/docker/libnetwork/bitseq/store.go

@@ -87,6 +87,7 @@ func (h *Handle) CopyTo(o datastore.KVObject) error {
 	dstH.dbIndex = h.dbIndex
 	dstH.dbExists = h.dbExists
 	dstH.store = h.store
+	dstH.curr = h.curr
 	dstH.Unlock()
 
 	return nil

+ 0 - 1
vendor/github.com/docker/libnetwork/drivers/overlay/encryption.go

@@ -21,7 +21,6 @@ import (
 
 const (
 	r            = 0xD0C4E3
-	timeout      = 30
 	pktExpansion = 26 // SPI(4) + SeqN(4) + IV(8) + PadLength(1) + NextHeader(1) + ICV(8)
 )
 

+ 8 - 10
vendor/github.com/docker/libnetwork/drivers/overlay/joinleave.go

@@ -68,7 +68,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
 
 	ep.ifName = containerIfName
 
-	if err := d.writeEndpointToStore(ep); err != nil {
+	if err = d.writeEndpointToStore(ep); err != nil {
 		return fmt.Errorf("failed to update overlay endpoint %s to local data store: %v", ep.id[0:7], err)
 	}
 
@@ -86,7 +86,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
 		return err
 	}
 
-	if err := sbox.AddInterface(overlayIfName, "veth",
+	if err = sbox.AddInterface(overlayIfName, "veth",
 		sbox.InterfaceOptions().Master(s.brName)); err != nil {
 		return fmt.Errorf("could not add veth pair inside the network sandbox: %v", err)
 	}
@@ -100,7 +100,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
 		return err
 	}
 
-	if err := nlh.LinkSetHardwareAddr(veth, ep.mac); err != nil {
+	if err = nlh.LinkSetHardwareAddr(veth, ep.mac); err != nil {
 		return fmt.Errorf("could not set mac address (%v) to the container interface: %v", ep.mac, err)
 	}
 
@@ -108,7 +108,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
 		if sub == s {
 			continue
 		}
-		if err := jinfo.AddStaticRoute(sub.subnetIP, types.NEXTHOP, s.gwIP.IP); err != nil {
+		if err = jinfo.AddStaticRoute(sub.subnetIP, types.NEXTHOP, s.gwIP.IP); err != nil {
 			logrus.Errorf("Adding subnet %s static route in network %q failed\n", s.subnetIP, n.id)
 		}
 	}
@@ -122,7 +122,7 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
 
 	d.peerAdd(nid, eid, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), false, false, true)
 
-	if err := d.checkEncryption(nid, nil, n.vxlanID(s), true, true); err != nil {
+	if err = d.checkEncryption(nid, nil, n.vxlanID(s), true, true); err != nil {
 		logrus.Warn(err)
 	}
 
@@ -200,7 +200,7 @@ func (d *driver) EventNotify(etype driverapi.EventType, nid, tableName, key stri
 	}
 
 	if etype == driverapi.Delete {
-		d.peerDelete(nid, eid, addr.IP, addr.Mask, mac, vtep)
+		d.peerDelete(nid, eid, addr.IP, addr.Mask, mac, vtep, false)
 		return
 	}
 
@@ -232,11 +232,9 @@ func (d *driver) Leave(nid, eid string) error {
 		}
 	}
 
-	n.leaveSandbox()
+	d.peerDelete(nid, eid, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), true)
 
-	if err := d.checkEncryption(nid, nil, 0, true, false); err != nil {
-		logrus.Warn(err)
-	}
+	n.leaveSandbox()
 
 	return nil
 }

+ 19 - 51
vendor/github.com/docker/libnetwork/drivers/overlay/ov_network.go

@@ -119,7 +119,7 @@ func setDefaultVlan() {
 	data := []byte{'0', '\n'}
 
 	if err = ioutil.WriteFile(path, data, 0644); err != nil {
-		logrus.Errorf("endbling default vlan on bridge %s failed %v", brName, err)
+		logrus.Errorf("enabling default vlan on bridge %s failed %v", brName, err)
 		os.Exit(1)
 	}
 	os.Exit(0)
@@ -251,8 +251,9 @@ func (d *driver) DeleteNetwork(nid string) error {
 		if err := d.deleteEndpointFromStore(ep); err != nil {
 			logrus.Warnf("Failed to delete overlay endpoint %s from local store: %v", ep.id[0:7], err)
 		}
-
 	}
+	// flush the peerDB entries
+	d.peerFlush(nid)
 	d.deleteNetwork(nid)
 
 	vnis, err := n.releaseVxlanID()
@@ -505,11 +506,7 @@ func (n *network) restoreSubnetSandbox(s *subnet, brName, vxlanName string) erro
 	vxlanIfaceOption := make([]osl.IfaceOption, 1)
 	vxlanIfaceOption = append(vxlanIfaceOption, sbox.InterfaceOptions().Master(brName))
 	Ifaces[vxlanName+"+vxlan"] = vxlanIfaceOption
-	err = sbox.Restore(Ifaces, nil, nil, nil)
-	if err != nil {
-		return err
-	}
-	return nil
+	return sbox.Restore(Ifaces, nil, nil, nil)
 }
 
 func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error {
@@ -760,58 +757,38 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) {
 				continue
 			}
 
-			logrus.Debugf("miss notification: dest IP %v, dest MAC %v", ip, mac)
-
 			if neigh.State&(netlink.NUD_STALE|netlink.NUD_INCOMPLETE) == 0 {
 				continue
 			}
 
 			if n.driver.isSerfAlive() {
+				logrus.Debugf("miss notification: dest IP %v, dest MAC %v", ip, mac)
 				mac, IPmask, vtep, err := n.driver.resolvePeer(n.id, ip)
 				if err != nil {
 					logrus.Errorf("could not resolve peer %q: %v", ip, err)
 					continue
 				}
 				n.driver.peerAdd(n.id, "dummy", ip, IPmask, mac, vtep, l2Miss, l3Miss, false)
-			} else {
-				// If the gc_thresh values are lower kernel might knock off the neighor entries.
-				// When we get a L3 miss check if its a valid peer and reprogram the neighbor
-				// entry again. Rate limit it to once attempt every 500ms, just in case a faulty
-				// container sends a flood of packets to invalid peers
-				if !l3Miss {
-					continue
-				}
-				if time.Since(t) > 500*time.Millisecond {
+			} else if l3Miss && time.Since(t) > time.Second {
+				// All the local peers will trigger a miss notification, but this one is expected and the local container will reply
+				// autonomously to the ARP request.
+				// In case the gc_thresh3 value is low, the kernel might reject new entries during peerAdd. This will trigger the
+				// extra logs below, which point out the possible issue.
+				// Entries created here would not be deleted; see the documentation at http://man7.org/linux/man-pages/man7/arp.7.html:
+				// "Entries which are marked as permanent are never deleted by the garbage-collector."
+				// The time limit here guarantees that the dbSearch is not done too
+				// frequently, which would stall the peerDB operations.
+				pKey, pEntry, err := n.driver.peerDbSearch(n.id, ip)
+				if err == nil && !pEntry.isLocal {
 					t = time.Now()
-					n.programNeighbor(ip)
+					logrus.Warnf("miss notification for peer:%+v l3Miss:%t l2Miss:%t, if the problem persists check the gc_thresh on the host pKey:%+v pEntry:%+v err:%v",
+						neigh, l3Miss, l2Miss, *pKey, *pEntry, err)
 				}
 			}
 		}
 	}
 }
 
-func (n *network) programNeighbor(ip net.IP) {
-	peerMac, _, _, err := n.driver.peerDbSearch(n.id, ip)
-	if err != nil {
-		logrus.Errorf("Reprogramming on L3 miss failed for %s, no peer entry", ip)
-		return
-	}
-	s := n.getSubnetforIPAddr(ip)
-	if s == nil {
-		logrus.Errorf("Reprogramming on L3 miss failed for %s, not a valid subnet", ip)
-		return
-	}
-	sbox := n.sandbox()
-	if sbox == nil {
-		logrus.Errorf("Reprogramming on L3 miss failed for %s, overlay sandbox missing", ip)
-		return
-	}
-	if err := sbox.AddNeighbor(ip, peerMac, true, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil {
-		logrus.Errorf("Reprogramming on L3 miss failed for %s: %v", ip, err)
-		return
-	}
-}
-
 func (d *driver) addNetwork(n *network) {
 	d.Lock()
 	d.networks[n.id] = n
@@ -1058,7 +1035,7 @@ func (n *network) obtainVxlanID(s *subnet) error {
 		}
 
 		if s.vni == 0 {
-			vxlanID, err := n.driver.vxlanIdm.GetID()
+			vxlanID, err := n.driver.vxlanIdm.GetID(true)
 			if err != nil {
 				return fmt.Errorf("failed to allocate vxlan id: %v", err)
 			}
@@ -1090,15 +1067,6 @@ func (n *network) contains(ip net.IP) bool {
 	return false
 }
 
-func (n *network) getSubnetforIPAddr(ip net.IP) *subnet {
-	for _, s := range n.subnets {
-		if s.subnetIP.Contains(ip) {
-			return s
-		}
-	}
-	return nil
-}
-
 // getSubnetforIP returns the subnet to which the given IP belongs
 func (n *network) getSubnetforIP(ip *net.IPNet) *subnet {
 	for _, s := range n.subnets {

+ 4 - 4
vendor/github.com/docker/libnetwork/drivers/overlay/ov_serf.go

@@ -122,7 +122,7 @@ func (d *driver) processEvent(u serf.UserEvent) {
 	case "join":
 		d.peerAdd(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr), false, false, false)
 	case "leave":
-		d.peerDelete(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr))
+		d.peerDelete(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr), false)
 	}
 }
 
@@ -135,13 +135,13 @@ func (d *driver) processQuery(q *serf.Query) {
 		fmt.Printf("Failed to scan query payload string: %v\n", err)
 	}
 
-	peerMac, peerIPMask, vtep, err := d.peerDbSearch(nid, net.ParseIP(ipStr))
+	pKey, pEntry, err := d.peerDbSearch(nid, net.ParseIP(ipStr))
 	if err != nil {
 		return
 	}
 
-	logrus.Debugf("Sending peer query resp mac %s, mask %s, vtep %s", peerMac, net.IP(peerIPMask), vtep)
-	q.Respond([]byte(fmt.Sprintf("%s %s %s", peerMac.String(), net.IP(peerIPMask).String(), vtep.String())))
+	logrus.Debugf("Sending peer query resp mac %v, mask %s, vtep %s", pKey.peerMac, net.IP(pEntry.peerIPMask).String(), pEntry.vtep)
+	q.Respond([]byte(fmt.Sprintf("%s %s %s", pKey.peerMac.String(), net.IP(pEntry.peerIPMask).String(), pEntry.vtep.String())))
 }
 
 func (d *driver) resolvePeer(nid string, peerIP net.IP) (net.HardwareAddr, net.IPMask, net.IP, error) {

+ 1 - 1
vendor/github.com/docker/libnetwork/drivers/overlay/overlay.go

@@ -262,7 +262,7 @@ func (d *driver) nodeJoin(advertiseAddress, bindAddress string, self bool) {
 		d.Unlock()
 
 		// If containers are already running on this network update the
-		// advertiseaddress in the peerDB
+		// advertise address in the peerDB
 		d.localJoinOnce.Do(func() {
 			d.peerDBUpdateSelf()
 		})

+ 1 - 1
vendor/github.com/docker/libnetwork/drivers/overlay/overlay.proto

@@ -24,4 +24,4 @@ message PeerRecord {
 	// which this container is running and can be reached by
 	// building a tunnel to that host IP.
 	string tunnel_endpoint_ip = 3 [(gogoproto.customname) = "TunnelEndpointIP"];
-}
+}

+ 1 - 1
vendor/github.com/docker/libnetwork/drivers/overlay/ovmanager/ovmanager.go

@@ -165,7 +165,7 @@ func (n *network) obtainVxlanID(s *subnet) error {
 	n.Unlock()
 
 	if vni == 0 {
-		vni, err = n.driver.vxlanIdm.GetIDInRange(vxlanIDStart, vxlanIDEnd)
+		vni, err = n.driver.vxlanIdm.GetIDInRange(vxlanIDStart, vxlanIDEnd, true)
 		if err != nil {
 			return err
 		}

+ 158 - 78
vendor/github.com/docker/libnetwork/drivers/overlay/peerdb.go

@@ -8,6 +8,7 @@ import (
 	"syscall"
 
 	"github.com/docker/libnetwork/common"
+	"github.com/docker/libnetwork/osl"
 	"github.com/sirupsen/logrus"
 )
 
@@ -22,16 +23,48 @@ type peerEntry struct {
 	eid        string
 	vtep       net.IP
 	peerIPMask net.IPMask
-	inSandbox  bool
 	isLocal    bool
 }
 
+func (p *peerEntry) MarshalDB() peerEntryDB {
+	ones, bits := p.peerIPMask.Size()
+	return peerEntryDB{
+		eid:            p.eid,
+		vtep:           p.vtep.String(),
+		peerIPMaskOnes: ones,
+		peerIPMaskBits: bits,
+		isLocal:        p.isLocal,
+	}
+}
+
+// This is the structure saved into the set (SetMatrix). Because of the set's
+// implementation, the values inserted have to be hashable, so the []byte fields
+// had to be converted into strings
+type peerEntryDB struct {
+	eid            string
+	vtep           string
+	peerIPMaskOnes int
+	peerIPMaskBits int
+	isLocal        bool
+}
+
+func (p *peerEntryDB) UnMarshalDB() peerEntry {
+	return peerEntry{
+		eid:        p.eid,
+		vtep:       net.ParseIP(p.vtep),
+		peerIPMask: net.CIDRMask(p.peerIPMaskOnes, p.peerIPMaskBits),
+		isLocal:    p.isLocal,
+	}
+}
+
 type peerMap struct {
-	mp map[string]peerEntry
+	// set of peerEntry; note that they have to be values, not pointers, to maintain proper equality checks
+	mp common.SetMatrix
 	sync.Mutex
 }
 
 type peerNetworkMap struct {
+	// map with key peerKey
 	mp map[string]*peerMap
 	sync.Mutex
 }
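
The MarshalDB/UnMarshalDB round trip above exists because the set is ultimately
backed by Go maps, whose members must be comparable: peerEntry carries net.IP and
net.IPMask (both []byte), and slices cannot be compared with == or used in map keys.
A standalone illustration of the constraint, independent of the SetMatrix
implementation:

	package main

	import "net"

	type rawEntry struct{ vtep net.IP } // contains a slice: not comparable
	type dbEntry struct{ vtep string }  // all fields comparable: usable as a set member

	func main() {
		set := map[dbEntry]struct{}{}
		set[dbEntry{vtep: net.ParseIP("10.0.0.1").String()}] = struct{}{}
		// A map keyed by rawEntry would not compile:
		// invalid map key type rawEntry.
		_ = set
	}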
@@ -54,11 +87,7 @@ func (pKey *peerKey) Scan(state fmt.ScanState, verb rune) error {
 	}
 
 	pKey.peerMac, err = net.ParseMAC(string(macB))
-	if err != nil {
-		return err
-	}
-
-	return nil
+	return err
 }
 
 func (d *driver) peerDbWalk(f func(string, *peerKey, *peerEntry) bool) error {
@@ -87,10 +116,13 @@ func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool
 	}
 
 	mp := map[string]peerEntry{}
-
 	pMap.Lock()
-	for pKeyStr, pEntry := range pMap.mp {
-		mp[pKeyStr] = pEntry
+	for _, pKeyStr := range pMap.mp.Keys() {
+		entryDBList, ok := pMap.mp.Get(pKeyStr)
+		if ok {
+			peerEntryDB := entryDBList[0].(peerEntryDB)
+			mp[pKeyStr] = peerEntryDB.UnMarshalDB()
+		}
 	}
 	pMap.Unlock()
 
@@ -107,45 +139,38 @@ func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool
 	return nil
 }
 
-func (d *driver) peerDbSearch(nid string, peerIP net.IP) (net.HardwareAddr, net.IPMask, net.IP, error) {
-	var (
-		peerMac    net.HardwareAddr
-		vtep       net.IP
-		peerIPMask net.IPMask
-		found      bool
-	)
-
+func (d *driver) peerDbSearch(nid string, peerIP net.IP) (*peerKey, *peerEntry, error) {
+	var pKeyMatched *peerKey
+	var pEntryMatched *peerEntry
 	err := d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
 		if pKey.peerIP.Equal(peerIP) {
-			peerMac = pKey.peerMac
-			peerIPMask = pEntry.peerIPMask
-			vtep = pEntry.vtep
-			found = true
-			return found
+			pKeyMatched = pKey
+			pEntryMatched = pEntry
+			return true
 		}
 
-		return found
+		return false
 	})
 
 	if err != nil {
-		return nil, nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err)
+		return nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err)
 	}
 
-	if !found {
-		return nil, nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP)
+	if pKeyMatched == nil || pEntryMatched == nil {
+		return nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP)
 	}
 
-	return peerMac, peerIPMask, vtep, nil
+	return pKeyMatched, pEntryMatched, nil
 }
 
 func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
-	peerMac net.HardwareAddr, vtep net.IP, isLocal bool) {
+	peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) {
 
 	d.peerDb.Lock()
 	pMap, ok := d.peerDb.mp[nid]
 	if !ok {
 		d.peerDb.mp[nid] = &peerMap{
-			mp: make(map[string]peerEntry),
+			mp: common.NewSetMatrix(),
 		}
 
 		pMap = d.peerDb.mp[nid]
@@ -165,18 +190,24 @@ func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask
 	}
 
 	pMap.Lock()
-	pMap.mp[pKey.String()] = pEntry
-	pMap.Unlock()
+	defer pMap.Unlock()
+	b, i := pMap.mp.Insert(pKey.String(), pEntry.MarshalDB())
+	if i != 1 {
+		// Transient case: there is more than one endpoint using the same IP,MAC pair
+		s, _ := pMap.mp.String(pKey.String())
+		logrus.Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
+	}
+	return b, i
 }
 
 func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
-	peerMac net.HardwareAddr, vtep net.IP) peerEntry {
+	peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) {
 
 	d.peerDb.Lock()
 	pMap, ok := d.peerDb.mp[nid]
 	if !ok {
 		d.peerDb.Unlock()
-		return peerEntry{}
+		return false, 0
 	}
 	d.peerDb.Unlock()
 
@@ -185,22 +216,22 @@ func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPM
 		peerMac: peerMac,
 	}
 
-	pMap.Lock()
-
-	pEntry, ok := pMap.mp[pKey.String()]
-	if ok {
-		// Mismatched endpoint ID(possibly outdated). Do not
-		// delete peerdb
-		if pEntry.eid != eid {
-			pMap.Unlock()
-			return pEntry
-		}
+	pEntry := peerEntry{
+		eid:        eid,
+		vtep:       vtep,
+		peerIPMask: peerIPMask,
+		isLocal:    isLocal,
 	}
 
-	delete(pMap.mp, pKey.String())
-	pMap.Unlock()
-
-	return pEntry
+	pMap.Lock()
+	defer pMap.Unlock()
+	b, i := pMap.mp.Remove(pKey.String(), pEntry.MarshalDB())
+	if i != 0 {
+		// Transient case: there is more than one endpoint using the same IP,MAC pair
+		s, _ := pMap.mp.String(pKey.String())
+		logrus.Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
+	}
+	return b, i
 }
 
 // The overlay uses a lazy initialization approach, this means that when a network is created
@@ -224,6 +255,7 @@ const (
 	peerOperationINIT peerOperationType = iota
 	peerOperationADD
 	peerOperationDELETE
+	peerOperationFLUSH
 )
 
 type peerOperation struct {
@@ -253,7 +285,9 @@ func (d *driver) peerOpRoutine(ctx context.Context, ch chan *peerOperation) {
 			case peerOperationADD:
 				err = d.peerAddOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.l2Miss, op.l3Miss, true, op.localPeer)
 			case peerOperationDELETE:
-				err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP)
+				err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.localPeer)
+			case peerOperationFLUSH:
+				err = d.peerFlushOp(op.networkID)
 			}
 			if err != nil {
 				logrus.Warnf("Peer operation failed:%s op:%v", err, op)
@@ -286,7 +320,6 @@ func (d *driver) peerInitOp(nid string) error {
 
 func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
 	peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, localPeer bool) {
-	callerName := common.CallerName(1)
 	d.peerOpCh <- &peerOperation{
 		opType:     peerOperationADD,
 		networkID:  nid,
@@ -298,24 +331,32 @@ func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
 		l2Miss:     l2Miss,
 		l3Miss:     l3Miss,
 		localPeer:  localPeer,
-		callerName: callerName,
+		callerName: common.CallerName(1),
 	}
 }
 
 func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
-	peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, updateOnlyDB bool) error {
+	peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, localPeer bool) error {
 
 	if err := validateID(nid, eid); err != nil {
 		return err
 	}
 
+	var dbEntries int
+	var inserted bool
 	if updateDB {
-		d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, false)
-		if updateOnlyDB {
-			return nil
+		inserted, dbEntries = d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
+		if !inserted {
+			logrus.Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v",
+				nid, eid, peerIP, peerMac, localPeer, vtep)
 		}
 	}
 
+	// Local peers do not need any further configuration
+	if localPeer {
+		return nil
+	}
+
 	n := d.network(nid)
 	if n == nil {
 		return nil
@@ -353,21 +394,26 @@ func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask
 
 	// Add neighbor entry for the peer IP
 	if err := sbox.AddNeighbor(peerIP, peerMac, l3Miss, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil {
-		return fmt.Errorf("could not add neighbor entry into the sandbox: %v", err)
+		if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 1 {
+			// We are in the transient case, so only the first configuration is programmed into the kernel
+			// Upon deletion, if the active configuration is removed, the next one from the database will be restored
+			// Note that we also skip programming the next configuration here
+			return nil
+		}
+		return fmt.Errorf("could not add neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
 	}
 
 	// Add fdb entry to the bridge for the peer mac
 	if err := sbox.AddNeighbor(vtep, peerMac, l2Miss, sbox.NeighborOptions().LinkName(s.vxlanName),
 		sbox.NeighborOptions().Family(syscall.AF_BRIDGE)); err != nil {
-		return fmt.Errorf("could not add fdb entry into the sandbox: %v", err)
+		return fmt.Errorf("could not add fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
 	}
 
 	return nil
 }
 
 func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
-	peerMac net.HardwareAddr, vtep net.IP) {
-	callerName := common.CallerName(1)
+	peerMac net.HardwareAddr, vtep net.IP, localPeer bool) {
 	d.peerOpCh <- &peerOperation{
 		opType:     peerOperationDELETE,
 		networkID:  nid,
@@ -376,18 +422,23 @@ func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMas
 		peerIPMask: peerIPMask,
 		peerMac:    peerMac,
 		vtepIP:     vtep,
-		callerName: callerName,
+		callerName: common.CallerName(1),
+		localPeer:  localPeer,
 	}
 }
 
 func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
-	peerMac net.HardwareAddr, vtep net.IP) error {
+	peerMac net.HardwareAddr, vtep net.IP, localPeer bool) error {
 
 	if err := validateID(nid, eid); err != nil {
 		return err
 	}
 
-	pEntry := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep)
+	deleted, dbEntries := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
+	if !deleted {
+		logrus.Warnf("Entry was not in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v",
+			nid, eid, peerIP, peerMac, localPeer, vtep)
+	}
 
 	n := d.network(nid)
 	if n == nil {
@@ -399,30 +450,59 @@ func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPM
 		return nil
 	}
 
-	// Delete fdb entry to the bridge for the peer mac only if the
-	// entry existed in local peerdb. If it is a stale delete
-	// request, still call DeleteNeighbor but only to cleanup any
-	// leftover sandbox neighbor cache and not actually delete the
-	// kernel state.
-	if (eid == pEntry.eid && vtep.Equal(pEntry.vtep)) ||
-		(eid != pEntry.eid && !vtep.Equal(pEntry.vtep)) {
-		if err := sbox.DeleteNeighbor(vtep, peerMac,
-			eid == pEntry.eid && vtep.Equal(pEntry.vtep)); err != nil {
-			return fmt.Errorf("could not delete fdb entry into the sandbox: %v", err)
-		}
+	if err := d.checkEncryption(nid, vtep, 0, localPeer, false); err != nil {
+		logrus.Warn(err)
 	}
 
-	// Delete neighbor entry for the peer IP
-	if eid == pEntry.eid {
+	// Local peers do not have any local configuration to delete
+	if !localPeer {
+		// Remove fdb entry to the bridge for the peer mac
+		if err := sbox.DeleteNeighbor(vtep, peerMac, true); err != nil {
+			if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 {
+				// We fall in here if there is a transient state and the neighbor being deleted
+				// was never configured into the kernel (we allow only one configuration at a time per <ip,mac> mapping)
+				return nil
+			}
+			return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
+		}
+
+		// Delete neighbor entry for the peer IP
 		if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil {
-			return fmt.Errorf("could not delete neighbor entry into the sandbox: %v", err)
+			return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
 		}
 	}
 
-	if err := d.checkEncryption(nid, vtep, 0, false, false); err != nil {
-		logrus.Warn(err)
+	if dbEntries == 0 {
+		return nil
+	}
+
+	// If there is still an entry in the database and the deletion went through without errors, it means that there is now no
+	// configuration active in the kernel.
+	// Restore one configuration for the <ip,mac> directly from the database; note that it is guaranteed that there is one
+	peerKey, peerEntry, err := d.peerDbSearch(nid, peerIP)
+	if err != nil {
+		logrus.Errorf("peerDeleteOp unable to restore a configuration for nid:%s ip:%v mac:%v err:%s", nid, peerIP, peerMac, err)
+		return err
 	}
+	return d.peerAddOp(nid, peerEntry.eid, peerIP, peerEntry.peerIPMask, peerKey.peerMac, peerEntry.vtep, false, false, false, peerEntry.isLocal)
+}
 
+func (d *driver) peerFlush(nid string) {
+	d.peerOpCh <- &peerOperation{
+		opType:     peerOperationFLUSH,
+		networkID:  nid,
+		callerName: common.CallerName(1),
+	}
+}
+
+func (d *driver) peerFlushOp(nid string) error {
+	d.peerDb.Lock()
+	defer d.peerDb.Unlock()
+	_, ok := d.peerDb.mp[nid]
+	if !ok {
+		return fmt.Errorf("Unable to find the peerDB for nid:%s", nid)
+	}
+	delete(d.peerDb.mp, nid)
 	return nil
 }
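
The transient-duplicate handling in peerDbAdd and peerDbDelete leans on the
cardinality returned by the set operations: Insert and Remove report how many
entries remain under a key, so only the first insert programs the kernel and a
delete can restore a surviving entry from the database. A sketch of that contract,
assuming only the SetMatrix methods used above and an illustrative key format:

	sm := common.NewSetMatrix()
	key := "10.0.0.2+02:42:0a:00:00:02" // peerKey.String() form, illustrative
	entryA := peerEntryDB{eid: "ep1", vtep: "192.0.2.1"}
	entryB := peerEntryDB{eid: "ep2", vtep: "192.0.2.2"}

	_, n := sm.Insert(key, entryA) // n == 1: first owner, program the kernel
	_, n = sm.Insert(key, entryB)  // n == 2: transient duplicate, logged and skipped
	_, n = sm.Remove(key, entryA)  // n == 1: entryB survives, restored via peerAddOp
	_, n = sm.Remove(key, entryB)  // n == 0: nothing left for this <ip,mac> pair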
 

+ 1 - 1
vendor/github.com/docker/libnetwork/drivers/solaris/overlay/ov_network.go

@@ -718,7 +718,7 @@ func (n *network) obtainVxlanID(s *subnet) error {
 		}
 
 		if s.vni == 0 {
-			vxlanID, err := n.driver.vxlanIdm.GetID()
+			vxlanID, err := n.driver.vxlanIdm.GetID(true)
 			if err != nil {
 				return fmt.Errorf("failed to allocate vxlan id: %v", err)
 			}

+ 28 - 8
vendor/github.com/docker/libnetwork/drivers/windows/windows.go

@@ -62,10 +62,17 @@ type EndpointConnectivity struct {
 }
 
 type hnsEndpoint struct {
-	id             string
-	nid            string
-	profileID      string
-	Type           string
+	id        string
+	nid       string
+	profileID string
+	Type      string
+	// Note: Currently, the sandboxID is the same as the containerID, since Windows
+	// does not expose a separate sandboxID.
+	// In the future, Windows will support a sandboxID that is different
+	// from the containerID.
+	// We therefore use sandboxID now so that this code will not have to change
+	// when Windows properly supports one.
+	sandboxID      string
 	macAddress     net.HardwareAddr
 	epOption       *endpointOption       // User specified parameters
 	epConnectivity *EndpointConnectivity // User specified parameters
@@ -730,7 +737,15 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
 		return err
 	}
 
-	// This is just a stub for now
+	endpoint.sandboxID = sboxKey
+
+	err = hcsshim.HotAttachEndpoint(endpoint.sandboxID, endpoint.profileID)
+	if err != nil {
+		// If the container doesn't exist in HCS, do not return an error for hot add/remove
+		if err != hcsshim.ErrComputeSystemDoesNotExist {
+			return err
+		}
+	}
 
 	jinfo.DisableGatewayService()
 	return nil
@@ -744,13 +759,18 @@ func (d *driver) Leave(nid, eid string) error {
 	}
 
 	// Ensure that the endpoint exists
-	_, err = network.getEndpoint(eid)
+	endpoint, err := network.getEndpoint(eid)
 	if err != nil {
 		return err
 	}
 
-	// This is just a stub for now
-
+	err = hcsshim.HotDetachEndpoint(endpoint.sandboxID, endpoint.profileID)
+	if err != nil {
+		// If the container doesn't exist in HCS, do not return an error for hot add/remove
+		if err != hcsshim.ErrComputeSystemDoesNotExist {
+			return err
+		}
+	}
 	return nil
 }
 

+ 4 - 4
vendor/github.com/docker/libnetwork/idm/idm.go

@@ -34,11 +34,11 @@ func New(ds datastore.DataStore, id string, start, end uint64) (*Idm, error) {
 }
 
 // GetID returns the first available id in the set
-func (i *Idm) GetID() (uint64, error) {
+func (i *Idm) GetID(serial bool) (uint64, error) {
 	if i.handle == nil {
 		return 0, errors.New("ID set is not initialized")
 	}
-	ordinal, err := i.handle.SetAny()
+	ordinal, err := i.handle.SetAny(serial)
 	return i.start + ordinal, err
 }
 
@@ -56,7 +56,7 @@ func (i *Idm) GetSpecificID(id uint64) error {
 }
 
 // GetIDInRange returns the first available id in the set within a [start,end] range
-func (i *Idm) GetIDInRange(start, end uint64) (uint64, error) {
+func (i *Idm) GetIDInRange(start, end uint64, serial bool) (uint64, error) {
 	if i.handle == nil {
 		return 0, errors.New("ID set is not initialized")
 	}
@@ -65,7 +65,7 @@ func (i *Idm) GetIDInRange(start, end uint64) (uint64, error) {
 		return 0, errors.New("Requested range does not belong to the set")
 	}
 
-	ordinal, err := i.handle.SetAnyInRange(start-i.start, end-i.start)
+	ordinal, err := i.handle.SetAnyInRange(start-i.start, end-i.start, serial)
 
 	return i.start + ordinal, err
 }
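
A short usage sketch of the updated interface, mirroring how the overlay drivers
now allocate VNIs (a nil datastore keeps the bit sequence in memory only; the
vxlanIDStart/vxlanIDEnd constants are assumed from the caller's package):

	func allocateVNI() (uint64, error) {
		idSet, err := idm.New(nil, "vni", vxlanIDStart, vxlanIDEnd)
		if err != nil {
			return 0, err
		}
		// serial=true resumes the scan after the last allocation instead of
		// always returning the lowest free ID, so released VNIs are not
		// reused immediately.
		return idSet.GetIDInRange(vxlanIDStart, vxlanIDEnd, true)
	}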

+ 12 - 4
vendor/github.com/docker/libnetwork/ipam/allocator.go

@@ -457,7 +457,15 @@ func (a *Allocator) RequestAddress(poolID string, prefAddress net.IP, opts map[s
 		return nil, nil, types.InternalErrorf("could not find bitmask in datastore for %s on address %v request from pool %s: %v",
 			k.String(), prefAddress, poolID, err)
 	}
-	ip, err := a.getAddress(p.Pool, bm, prefAddress, p.Range)
+	// Callers can pass in an option to choose between serial IP address allocation
+	// and the first available IP in the subnet
+	var serial bool
+	if opts != nil {
+		if val, ok := opts[ipamapi.AllocSerialPrefix]; ok {
+			serial = (val == "true")
+		}
+	}
+	ip, err := a.getAddress(p.Pool, bm, prefAddress, p.Range, serial)
 	if err != nil {
 		return nil, nil, err
 	}
@@ -522,7 +530,7 @@ func (a *Allocator) ReleaseAddress(poolID string, address net.IP) error {
 	return bm.Unset(ipToUint64(h))
 }
 
-func (a *Allocator) getAddress(nw *net.IPNet, bitmask *bitseq.Handle, prefAddress net.IP, ipr *AddressRange) (net.IP, error) {
+func (a *Allocator) getAddress(nw *net.IPNet, bitmask *bitseq.Handle, prefAddress net.IP, ipr *AddressRange, serial bool) (net.IP, error) {
 	var (
 		ordinal uint64
 		err     error
@@ -535,7 +543,7 @@ func (a *Allocator) getAddress(nw *net.IPNet, bitmask *bitseq.Handle, prefAddres
 		return nil, ipamapi.ErrNoAvailableIPs
 	}
 	if ipr == nil && prefAddress == nil {
-		ordinal, err = bitmask.SetAny()
+		ordinal, err = bitmask.SetAny(serial)
 	} else if prefAddress != nil {
 		hostPart, e := types.GetHostPartIP(prefAddress, base.Mask)
 		if e != nil {
@@ -544,7 +552,7 @@ func (a *Allocator) getAddress(nw *net.IPNet, bitmask *bitseq.Handle, prefAddres
 		ordinal = ipToUint64(types.GetMinimalIP(hostPart))
 		err = bitmask.Set(ordinal)
 	} else {
-		ordinal, err = bitmask.SetAnyInRange(ipr.Start, ipr.End)
+		ordinal, err = bitmask.SetAnyInRange(ipr.Start, ipr.End, serial)
 	}
 
 	switch err {

+ 10 - 0
vendor/github.com/docker/libnetwork/ipamapi/labels.go

@@ -0,0 +1,10 @@
+package ipamapi
+
+const (
+	// Prefix constant marks the reserved label space for libnetwork
+	Prefix = "com.docker.network"
+
+	// AllocSerialPrefix constant marks the reserved label space for libnetwork IPAM
+	// allocation ordering (serial/first available)
+	AllocSerialPrefix = Prefix + ".ipam.serial"
+)
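
The allocator reads this label from the options map passed to RequestAddress, which
is how the swarm test at the top of this diff ends up seeing
com.docker.network.ipam.serial:true in the network's IPAM options. A hedged sketch
of a direct caller (assumes a *ipam.Allocator and a poolID obtained from RequestPool,
with the net, ipam and ipamapi packages imported):

	func requestSerial(a *ipam.Allocator, poolID string) (*net.IPNet, error) {
		opts := map[string]string{
			ipamapi.AllocSerialPrefix: "true", // "com.docker.network.ipam.serial"
		}
		// prefAddress nil: the allocator picks the next free address serially.
		ip, _, err := a.RequestAddress(poolID, nil, opts)
		return ip, err
	}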

+ 1 - 1
vendor/github.com/docker/libnetwork/iptables/iptables.go

@@ -456,7 +456,7 @@ func RawCombinedOutputNative(args ...string) error {
 
 // ExistChain checks if a chain exists
 func ExistChain(chain string, table Table) bool {
-	if _, err := Raw("-t", string(table), "-L", chain); err == nil {
+	if _, err := Raw("-t", string(table), "-nL", chain); err == nil {
 		return true
 	}
 	return false
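
Context for the flag change: -n makes iptables print addresses and ports
numerically, so the existence check no longer risks blocking on reverse DNS
lookups while listing the chain.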

+ 6 - 6
vendor/github.com/docker/libnetwork/networkdb/broadcast.go

@@ -32,7 +32,7 @@ func (nDB *NetworkDB) sendNetworkEvent(nid string, event NetworkEvent_Type, ltim
 	nEvent := NetworkEvent{
 		Type:      event,
 		LTime:     ltime,
-		NodeName:  nDB.config.NodeName,
+		NodeName:  nDB.config.NodeID,
 		NetworkID: nid,
 	}
 
@@ -44,7 +44,7 @@ func (nDB *NetworkDB) sendNetworkEvent(nid string, event NetworkEvent_Type, ltim
 	nDB.networkBroadcasts.QueueBroadcast(&networkEventMessage{
 		msg:  raw,
 		id:   nid,
-		node: nDB.config.NodeName,
+		node: nDB.config.NodeID,
 	})
 	return nil
 }
@@ -72,7 +72,7 @@ func (nDB *NetworkDB) sendNodeEvent(event NodeEvent_Type) error {
 	nEvent := NodeEvent{
 		Type:     event,
 		LTime:    nDB.networkClock.Increment(),
-		NodeName: nDB.config.NodeName,
+		NodeName: nDB.config.NodeID,
 	}
 
 	raw, err := encodeMessage(MessageTypeNodeEvent, &nEvent)
@@ -129,7 +129,7 @@ func (nDB *NetworkDB) sendTableEvent(event TableEvent_Type, nid string, tname st
 	tEvent := TableEvent{
 		Type:      event,
 		LTime:     entry.ltime,
-		NodeName:  nDB.config.NodeName,
+		NodeName:  nDB.config.NodeID,
 		NetworkID: nid,
 		TableName: tname,
 		Key:       key,
@@ -145,7 +145,7 @@ func (nDB *NetworkDB) sendTableEvent(event TableEvent_Type, nid string, tname st
 
 	var broadcastQ *memberlist.TransmitLimitedQueue
 	nDB.RLock()
-	thisNodeNetworks, ok := nDB.networks[nDB.config.NodeName]
+	thisNodeNetworks, ok := nDB.networks[nDB.config.NodeID]
 	if ok {
 		// The network may have been removed
 		network, networkOk := thisNodeNetworks[nid]
@@ -168,7 +168,7 @@ func (nDB *NetworkDB) sendTableEvent(event TableEvent_Type, nid string, tname st
 		id:    nid,
 		tname: tname,
 		key:   key,
-		node:  nDB.config.NodeName,
+		node:  nDB.config.NodeID,
 	})
 	return nil
 }

+ 14 - 12
vendor/github.com/docker/libnetwork/networkdb/cluster.go

@@ -106,7 +106,7 @@ func (nDB *NetworkDB) clusterInit() error {
 	nDB.lastHealthTimestamp = nDB.lastStatsTimestamp
 
 	config := memberlist.DefaultLANConfig()
-	config.Name = nDB.config.NodeName
+	config.Name = nDB.config.NodeID
 	config.BindAddr = nDB.config.BindAddr
 	config.AdvertiseAddr = nDB.config.AdvertiseAddr
 	config.UDPBufferSize = nDB.config.PacketBufferSize
@@ -329,7 +329,7 @@ func (nDB *NetworkDB) reapTableEntries() {
 	var nodeNetworks []string
 	// This is best effort; any changes to the list of networks will be picked up in the next cycle
 	nDB.RLock()
-	for nid := range nDB.networks[nDB.config.NodeName] {
+	for nid := range nDB.networks[nDB.config.NodeID] {
 		nodeNetworks = append(nodeNetworks, nid)
 	}
 	nDB.RUnlock()
@@ -376,7 +376,7 @@ func (nDB *NetworkDB) reapTableEntries() {
 func (nDB *NetworkDB) gossip() {
 	networkNodes := make(map[string][]string)
 	nDB.RLock()
-	thisNodeNetworks := nDB.networks[nDB.config.NodeName]
+	thisNodeNetworks := nDB.networks[nDB.config.NodeID]
 	for nid := range thisNodeNetworks {
 		networkNodes[nid] = nDB.networkNodes[nid]
 
@@ -388,7 +388,7 @@ func (nDB *NetworkDB) gossip() {
 	if printHealth {
 		healthScore := nDB.memberlist.GetHealthScore()
 		if healthScore != 0 {
-			logrus.Warnf("NetworkDB stats - healthscore:%d (connectivity issues)", healthScore)
+			logrus.Warnf("NetworkDB stats %v(%v) - healthscore:%d (connectivity issues)", nDB.config.Hostname, nDB.config.NodeID, healthScore)
 		}
 		nDB.lastHealthTimestamp = time.Now()
 	}
@@ -419,7 +419,8 @@ func (nDB *NetworkDB) gossip() {
 		// Collect stats and print the queue info, note this code is here also to have a view of the queues empty
 		network.qMessagesSent += len(msgs)
 		if printStats {
-			logrus.Infof("NetworkDB stats - netID:%s leaving:%t netPeers:%d entries:%d Queue qLen:%d netMsg/s:%d",
+			logrus.Infof("NetworkDB stats %v(%v) - netID:%s leaving:%t netPeers:%d entries:%d Queue qLen:%d netMsg/s:%d",
+				nDB.config.Hostname, nDB.config.NodeID,
 				nid, network.leaving, broadcastQ.NumNodes(), network.entriesNumber, broadcastQ.NumQueued(),
 				network.qMessagesSent/int((nDB.config.StatsPrintPeriod/time.Second)))
 			network.qMessagesSent = 0
@@ -456,7 +457,7 @@ func (nDB *NetworkDB) gossip() {
 func (nDB *NetworkDB) bulkSyncTables() {
 	var networks []string
 	nDB.RLock()
-	for nid, network := range nDB.networks[nDB.config.NodeName] {
+	for nid, network := range nDB.networks[nDB.config.NodeID] {
 		if network.leaving {
 			continue
 		}
@@ -522,10 +523,10 @@ func (nDB *NetworkDB) bulkSync(nodes []string, all bool) ([]string, error) {
 	var err error
 	var networks []string
 	for _, node := range nodes {
-		if node == nDB.config.NodeName {
+		if node == nDB.config.NodeID {
 			continue
 		}
-		logrus.Debugf("%s: Initiating bulk sync with node %v", nDB.config.NodeName, node)
+		logrus.Debugf("%v(%v): Initiating bulk sync with node %v", nDB.config.Hostname, nDB.config.NodeID, node)
 		networks = nDB.findCommonNetworks(node)
 		err = nDB.bulkSyncNode(networks, node, true)
 		// if its periodic bulksync stop after the first successful sync
@@ -556,7 +557,8 @@ func (nDB *NetworkDB) bulkSyncNode(networks []string, node string, unsolicited b
 		unsolMsg = "unsolicited"
 	}
 
-	logrus.Debugf("%s: Initiating %s bulk sync for networks %v with node %s", nDB.config.NodeName, unsolMsg, networks, node)
+	logrus.Debugf("%v(%v): Initiating %s bulk sync for networks %v with node %s",
+		nDB.config.Hostname, nDB.config.NodeID, unsolMsg, networks, node)
 
 	nDB.RLock()
 	mnode := nDB.nodes[node]
@@ -608,7 +610,7 @@ func (nDB *NetworkDB) bulkSyncNode(networks []string, node string, unsolicited b
 	bsm := BulkSyncMessage{
 		LTime:       nDB.tableClock.Time(),
 		Unsolicited: unsolicited,
-		NodeName:    nDB.config.NodeName,
+		NodeName:    nDB.config.NodeID,
 		Networks:    networks,
 		Payload:     compound,
 	}
@@ -640,7 +642,7 @@ func (nDB *NetworkDB) bulkSyncNode(networks []string, node string, unsolicited b
 		case <-t.C:
 			logrus.Errorf("Bulk sync to node %s timed out", node)
 		case <-ch:
-			logrus.Debugf("%s: Bulk sync to node %s took %s", nDB.config.NodeName, node, time.Since(startTime))
+			logrus.Debugf("%v(%v): Bulk sync to node %s took %s", nDB.config.Hostname, nDB.config.NodeID, node, time.Since(startTime))
 		}
 		t.Stop()
 	}
@@ -677,7 +679,7 @@ OUTER:
 		idx := randomOffset(n)
 		node := nodes[idx]
 
-		if node == nDB.config.NodeName {
+		if node == nDB.config.NodeID {
 			continue
 		}
 

+ 8 - 32
vendor/github.com/docker/libnetwork/networkdb/delegate.go

@@ -2,7 +2,6 @@ package networkdb
 
 import (
 	"net"
-	"strings"
 	"time"
 
 	"github.com/gogo/protobuf/proto"
@@ -58,29 +57,6 @@ func (nDB *NetworkDB) checkAndGetNode(nEvent *NodeEvent) *node {
 	return nil
 }
 
-func (nDB *NetworkDB) purgeSameNode(n *node) {
-	nDB.Lock()
-	defer nDB.Unlock()
-
-	prefix := strings.Split(n.Name, "-")[0]
-	for _, nodes := range []map[string]*node{
-		nDB.failedNodes,
-		nDB.leftNodes,
-		nDB.nodes,
-	} {
-		var nodeNames []string
-		for name, node := range nodes {
-			if strings.HasPrefix(name, prefix) && n.Addr.Equal(node.Addr) {
-				nodeNames = append(nodeNames, name)
-			}
-		}
-
-		for _, name := range nodeNames {
-			delete(nodes, name)
-		}
-	}
-}
-
 func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool {
 	// Update our local clock if the received messages has newer
 	// time.
@@ -108,7 +84,6 @@ func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool {
 		return false
 	}
 
-	nDB.purgeSameNode(n)
 	n.ltime = nEvent.LTime
 
 	switch nEvent.Type {
@@ -140,7 +115,7 @@ func (nDB *NetworkDB) handleNetworkEvent(nEvent *NetworkEvent) bool {
 	nDB.Lock()
 	defer nDB.Unlock()
 
-	if nEvent.NodeName == nDB.config.NodeName {
+	if nEvent.NodeName == nDB.config.NodeID {
 		return false
 	}
 
@@ -203,7 +178,7 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent) bool {
 
 	// Ignore the table events for networks that are in the process of going away
 	nDB.RLock()
-	networks := nDB.networks[nDB.config.NodeName]
+	networks := nDB.networks[nDB.config.NodeID]
 	network, ok := networks[tEvent.NetworkID]
 	// Check if the owner of the event is still part of the network
 	nodes := nDB.networkNodes[tEvent.NetworkID]
@@ -253,7 +228,8 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent) bool {
 		// If it is a delete event and we did not have a state for it, don't propagate to the application
 		// If the residual reapTime is lower or equal to 1/6 of the total reapTime don't bother broadcasting it around
 		// most likely the cluster is already aware of it; if not, whoever syncs with this node will catch the state too.
-		return e.reapTime > reapPeriod/6
+		// This also avoids having the deletion of entries close to their garbage collection circling around forever
+		return e.reapTime > reapEntryInterval/6
 	}
 
 	var op opType
@@ -292,7 +268,7 @@ func (nDB *NetworkDB) handleTableMessage(buf []byte, isBulkSync bool) {
 	}
 
 	// Ignore messages that this node generated.
-	if tEvent.NodeName == nDB.config.NodeName {
+	if tEvent.NodeName == nDB.config.NodeID {
 		return
 	}
 
@@ -305,7 +281,7 @@ func (nDB *NetworkDB) handleTableMessage(buf []byte, isBulkSync bool) {
 		}
 
 		nDB.RLock()
-		n, ok := nDB.networks[nDB.config.NodeName][tEvent.NetworkID]
+		n, ok := nDB.networks[nDB.config.NodeID][tEvent.NetworkID]
 		nDB.RUnlock()
 
 		// if the network is not there anymore, OR we are leaving the network OR the broadcast queue is not present
@@ -424,7 +400,7 @@ func (nDB *NetworkDB) handleMessage(buf []byte, isBulkSync bool) {
 	case MessageTypeCompound:
 		nDB.handleCompound(data, isBulkSync)
 	default:
-		logrus.Errorf("%s: unknown message type %d", nDB.config.NodeName, mType)
+		logrus.Errorf("%v(%v): unknown message type %d", nDB.config.Hostname, nDB.config.NodeID, mType)
 	}
 }
 
@@ -457,7 +433,7 @@ func (d *delegate) LocalState(join bool) []byte {
 
 	pp := NetworkPushPull{
 		LTime:    d.nDB.networkClock.Time(),
-		NodeName: d.nDB.config.NodeName,
+		NodeName: d.nDB.config.NodeID,
 	}
 
 	for name, nn := range d.nDB.networks {

+ 33 - 22
vendor/github.com/docker/libnetwork/networkdb/networkdb.go

@@ -11,6 +11,7 @@ import (
 	"time"
 
 	"github.com/armon/go-radix"
+	"github.com/docker/docker/pkg/stringid"
 	"github.com/docker/go-events"
 	"github.com/docker/libnetwork/types"
 	"github.com/hashicorp/memberlist"
@@ -151,8 +152,11 @@ type network struct {
 // Config represents the configuration of the networkdb instance and
 // can be passed by the caller.
 type Config struct {
-	// NodeName is the cluster wide unique name for this node.
-	NodeName string
+	// NodeID is the unique identifier of the node when it is part of the cluster
+	NodeID string
+
+	// Hostname is the node hostname.
+	Hostname string
 
 	// BindAddr is the IP on which networkdb listens. It can be
 	// 0.0.0.0 to listen on all addresses on the host.
@@ -210,7 +214,8 @@ type entry struct {
 func DefaultConfig() *Config {
 	hostname, _ := os.Hostname()
 	return &Config{
-		NodeName:          hostname,
+		NodeID:            stringid.TruncateID(stringid.GenerateRandomID()),
+		Hostname:          hostname,
 		BindAddr:          "0.0.0.0",
 		PacketBufferSize:  1400,
 		StatsPrintPeriod:  5 * time.Minute,
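
With this split, log lines identify a node as hostname(node-id) while the gossip
layer keys all of its state by the random NodeID. A minimal consumer sketch using
only the Config fields shown in this diff (the advertise address is a placeholder):

	func newNetworkDB() (*networkdb.NetworkDB, error) {
		conf := networkdb.DefaultConfig() // random NodeID plus this host's name
		conf.BindAddr = "0.0.0.0"
		conf.AdvertiseAddr = "192.0.2.10" // placeholder
		return networkdb.New(conf)
	}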
@@ -236,6 +241,7 @@ func New(c *Config) (*NetworkDB, error) {
 	nDB.indexes[byTable] = radix.New()
 	nDB.indexes[byNetwork] = radix.New()
 
+	logrus.Debugf("New memberlist node - Node:%v will use memberlist nodeID:%v", c.Hostname, c.NodeID)
 	if err := nDB.clusterInit(); err != nil {
 		return nil, err
 	}
@@ -259,8 +265,11 @@ func (nDB *NetworkDB) Join(members []string) error {
 // stopping timers, canceling goroutines etc.
 func (nDB *NetworkDB) Close() {
 	if err := nDB.clusterLeave(); err != nil {
-		logrus.Errorf("Could not close DB %s: %v", nDB.config.NodeName, err)
+		logrus.Errorf("%v(%v) Could not close DB: %v", nDB.config.Hostname, nDB.config.NodeID, err)
 	}
+
+	//Avoid (*Broadcaster).run goroutine leak
+	nDB.broadcaster.Close()
 }
 
 // ClusterPeers returns all the gossip cluster peers.
@@ -334,7 +343,7 @@ func (nDB *NetworkDB) CreateEntry(tname, nid, key string, value []byte) error {
 
 	entry := &entry{
 		ltime: nDB.tableClock.Increment(),
-		node:  nDB.config.NodeName,
+		node:  nDB.config.NodeID,
 		value: value,
 	}
 
@@ -360,7 +369,7 @@ func (nDB *NetworkDB) UpdateEntry(tname, nid, key string, value []byte) error {
 
 	entry := &entry{
 		ltime: nDB.tableClock.Increment(),
-		node:  nDB.config.NodeName,
+		node:  nDB.config.NodeID,
 		value: value,
 	}
 
@@ -402,7 +411,7 @@ func (nDB *NetworkDB) DeleteEntry(tname, nid, key string) error {
 
 	entry := &entry{
 		ltime:    nDB.tableClock.Increment(),
-		node:     nDB.config.NodeName,
+		node:     nDB.config.NodeID,
 		value:    value,
 		deleting: true,
 		reapTime: reapEntryInterval,
@@ -451,7 +460,7 @@ func (nDB *NetworkDB) deleteNetworkEntriesForNode(deletedNode string) {
 //		  entries owned by remote nodes, we will accept them and we notify the application
 func (nDB *NetworkDB) deleteNodeNetworkEntries(nid, node string) {
 	// Indicates if the delete is triggered for the local node
-	isNodeLocal := node == nDB.config.NodeName
+	isNodeLocal := node == nDB.config.NodeID
 
 	nDB.indexes[byNetwork].WalkPrefix(fmt.Sprintf("/%s", nid),
 		func(path string, v interface{}) bool {
@@ -496,7 +505,10 @@ func (nDB *NetworkDB) deleteNodeNetworkEntries(nid, node string) {
 				nDB.deleteEntry(nid, tname, key)
 			}
 
-			nDB.broadcaster.Write(makeEvent(opDelete, tname, nid, key, entry.value))
+			// Notify to the upper layer only entries not already marked for deletion
+			if !oldEntry.deleting {
+				nDB.broadcaster.Write(makeEvent(opDelete, tname, nid, key, entry.value))
+			}
 			return false
 		})
 }
@@ -552,10 +564,10 @@ func (nDB *NetworkDB) JoinNetwork(nid string) error {
 	ltime := nDB.networkClock.Increment()
 
 	nDB.Lock()
-	nodeNetworks, ok := nDB.networks[nDB.config.NodeName]
+	nodeNetworks, ok := nDB.networks[nDB.config.NodeID]
 	if !ok {
 		nodeNetworks = make(map[string]*network)
-		nDB.networks[nDB.config.NodeName] = nodeNetworks
+		nDB.networks[nDB.config.NodeID] = nodeNetworks
 	}
 	n, ok := nodeNetworks[nid]
 	var entries int
@@ -571,8 +583,7 @@ func (nDB *NetworkDB) JoinNetwork(nid string) error {
 		},
 		RetransmitMult: 4,
 	}
-
-	nDB.addNetworkNode(nid, nDB.config.NodeName)
+	nDB.addNetworkNode(nid, nDB.config.NodeID)
 	networkNodes := nDB.networkNodes[nid]
 	nDB.Unlock()
 
@@ -580,7 +591,7 @@ func (nDB *NetworkDB) JoinNetwork(nid string) error {
 		return fmt.Errorf("failed to send leave network event for %s: %v", nid, err)
 	}
 
-	logrus.Debugf("%s: joined network %s", nDB.config.NodeName, nid)
+	logrus.Debugf("%v(%v): joined network %s", nDB.config.Hostname, nDB.config.NodeID, nid)
 	if _, err := nDB.bulkSync(networkNodes, true); err != nil {
 		logrus.Errorf("Error bulk syncing while joining network %s: %v", nid, err)
 	}
@@ -604,12 +615,12 @@ func (nDB *NetworkDB) LeaveNetwork(nid string) error {
 	defer nDB.Unlock()
 
 	// Remove myself from the list of the nodes participating to the network
-	nDB.deleteNetworkNode(nid, nDB.config.NodeName)
+	nDB.deleteNetworkNode(nid, nDB.config.NodeID)
 
 	// Update all the local entries marking them for deletion and delete all the remote entries
-	nDB.deleteNodeNetworkEntries(nid, nDB.config.NodeName)
+	nDB.deleteNodeNetworkEntries(nid, nDB.config.NodeID)
 
-	nodeNetworks, ok := nDB.networks[nDB.config.NodeName]
+	nodeNetworks, ok := nDB.networks[nDB.config.NodeID]
 	if !ok {
 		return fmt.Errorf("could not find self node for network %s while trying to leave", nid)
 	}
@@ -619,7 +630,7 @@ func (nDB *NetworkDB) LeaveNetwork(nid string) error {
 		return fmt.Errorf("could not find network %s while trying to leave", nid)
 	}
 
-	logrus.Debugf("%s: leaving network %s", nDB.config.NodeName, nid)
+	logrus.Debugf("%v(%v): leaving network %s", nDB.config.Hostname, nDB.config.NodeID, nid)
 	n.ltime = ltime
 	n.reapTime = reapNetworkInterval
 	n.leaving = true
@@ -665,7 +676,7 @@ func (nDB *NetworkDB) findCommonNetworks(nodeName string) []string {
 	defer nDB.RUnlock()
 
 	var networks []string
-	for nid := range nDB.networks[nDB.config.NodeName] {
+	for nid := range nDB.networks[nDB.config.NodeID] {
 		if n, ok := nDB.networks[nodeName][nid]; ok {
 			if !n.leaving {
 				networks = append(networks, nid)
@@ -681,7 +692,7 @@ func (nDB *NetworkDB) updateLocalNetworkTime() {
 	defer nDB.Unlock()
 
 	ltime := nDB.networkClock.Increment()
-	for _, n := range nDB.networks[nDB.config.NodeName] {
+	for _, n := range nDB.networks[nDB.config.NodeID] {
 		n.ltime = ltime
 	}
 }
@@ -693,7 +704,7 @@ func (nDB *NetworkDB) createOrUpdateEntry(nid, tname, key string, entry interfac
 	_, okNetwork := nDB.indexes[byNetwork].Insert(fmt.Sprintf("/%s/%s/%s", nid, tname, key), entry)
 	if !okNetwork {
 		// Add only if it is an insert not an update
-		n, ok := nDB.networks[nDB.config.NodeName][nid]
+		n, ok := nDB.networks[nDB.config.NodeID][nid]
 		if ok {
 			n.entriesNumber++
 		}
@@ -708,7 +719,7 @@ func (nDB *NetworkDB) deleteEntry(nid, tname, key string) (bool, bool) {
 	_, okNetwork := nDB.indexes[byNetwork].Delete(fmt.Sprintf("/%s/%s/%s", nid, tname, key))
 	if okNetwork {
 		// Remove only if the delete is successful
-		n, ok := nDB.networks[nDB.config.NodeName][nid]
+		n, ok := nDB.networks[nDB.config.NodeID][nid]
 		if ok {
 			n.entriesNumber--
 		}
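
Taken together, the networkdb.go changes re-key all per-node state from the configured NodeName (a hostname, which may be neither unique nor stable across a cluster) to the generated NodeID, while log lines now print both for readability. A minimal sketch of the pattern, with hypothetical values:

package main

import "fmt"

func main() {
	// Per-node network state keyed by the generated node ID rather than
	// the hostname: IDs stay unique even when hostnames collide or change.
	networks := map[string]map[string]bool{} // NodeID -> nid -> joined
	hostname, nodeID := "worker-1", "a1b2c3d4e5f6" // hypothetical values

	if networks[nodeID] == nil {
		networks[nodeID] = map[string]bool{}
	}
	networks[nodeID]["net-1"] = true

	// Logs keep the human-readable hostname next to the ID, matching the
	// "%v(%v): joined network %s" format used above.
	fmt.Printf("%v(%v): joined network %s\n", hostname, nodeID, "net-1")
}
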

+ 1 - 6
vendor/github.com/docker/libnetwork/osl/namespace_windows.go

@@ -5,12 +5,7 @@ import "testing"
 // GenerateKey generates a sandbox key based on the passed
 // container id.
 func GenerateKey(containerID string) string {
-	maxLen := 12
-	if len(containerID) < maxLen {
-		maxLen = len(containerID)
-	}
-
-	return containerID[:maxLen]
+	return containerID
 }
 
 // NewSandbox provides a new sandbox instance created in an os specific way
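
The Windows GenerateKey change drops the 12-character truncation: two container IDs sharing a prefix would previously map to the same sandbox key, while the full ID cannot collide. A small sketch of the collision the old code allowed (hypothetical IDs):

package main

import "fmt"

// truncatedKey reproduces the old behaviour for illustration.
func truncatedKey(id string) string {
	if len(id) < 12 {
		return id
	}
	return id[:12]
}

func main() {
	a := "0123456789abcdeffff"
	b := "0123456789abcd00000"
	fmt.Println(truncatedKey(a) == truncatedKey(b)) // true: same 12-char prefix
	fmt.Println(a == b)                             // false: the full IDs differ
}
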

+ 26 - 11
vendor/github.com/docker/libnetwork/osl/neigh_linux.go

@@ -9,6 +9,17 @@ import (
 	"github.com/vishvananda/netlink"
 )
 
+// NeighborSearchError indicates a failed neighbor lookup: the entry was
+// either already present (on add) or missing (on delete), per the present flag
+type NeighborSearchError struct {
+	ip      net.IP
+	mac     net.HardwareAddr
+	present bool
+}
+
+func (n NeighborSearchError) Error() string {
+	return fmt.Sprintf("Search neighbor failed for IP %v, mac %v, present in db:%t", n.ip, n.mac, n.present)
+}
+
 // NeighOption is a function option type to set interface options
 type NeighOption func(nh *neigh)
 
@@ -41,7 +52,7 @@ func (n *networkNamespace) DeleteNeighbor(dstIP net.IP, dstMac net.HardwareAddr,
 
 	nh := n.findNeighbor(dstIP, dstMac)
 	if nh == nil {
-		return fmt.Errorf("could not find the neighbor entry to delete")
+		return NeighborSearchError{dstIP, dstMac, false}
 	}
 
 	if osDelete {
@@ -103,26 +114,27 @@ func (n *networkNamespace) DeleteNeighbor(dstIP net.IP, dstMac net.HardwareAddr,
 		}
 	}
 	n.Unlock()
-	logrus.Debugf("Neighbor entry deleted for IP %v, mac %v", dstIP, dstMac)
+	logrus.Debugf("Neighbor entry deleted for IP %v, mac %v osDelete:%t", dstIP, dstMac, osDelete)
 
 	return nil
 }
 
 func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, force bool, options ...NeighOption) error {
 	var (
-		iface netlink.Link
-		err   error
+		iface                  netlink.Link
+		err                    error
+		neighborAlreadyPresent bool
 	)
 
 	// If the namespace already has the neighbor entry but the AddNeighbor is called
 	// because of a miss notification (force flag) program the kernel anyway.
 	nh := n.findNeighbor(dstIP, dstMac)
 	if nh != nil {
+		neighborAlreadyPresent = true
+		logrus.Warnf("Neighbor entry already present for IP %v, mac %v neighbor:%+v forceUpdate:%t", dstIP, dstMac, nh, force)
 		if !force {
-			logrus.Warnf("Neighbor entry already present for IP %v, mac %v", dstIP, dstMac)
-			return nil
+			return NeighborSearchError{dstIP, dstMac, true}
 		}
-		logrus.Warnf("Force kernel update, Neighbor entry already present for IP %v, mac %v", dstIP, dstMac)
 	}
 
 	nh = &neigh{
@@ -146,8 +158,7 @@ func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, fo
 	if nh.linkDst != "" {
 		iface, err = nlh.LinkByName(nh.linkDst)
 		if err != nil {
-			return fmt.Errorf("could not find interface with destination name %s: %v",
-				nh.linkDst, err)
+			return fmt.Errorf("could not find interface with destination name %s: %v", nh.linkDst, err)
 		}
 	}
 
@@ -167,13 +178,17 @@ func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, fo
 	}
 
 	if err := nlh.NeighSet(nlnh); err != nil {
-		return fmt.Errorf("could not add neighbor entry: %v", err)
+		return fmt.Errorf("could not add neighbor entry:%+v error:%v", nlnh, err)
+	}
+
+	if neighborAlreadyPresent {
+		return nil
 	}
 
 	n.Lock()
 	n.neighbors = append(n.neighbors, nh)
 	n.Unlock()
-	logrus.Debugf("Neighbor entry added for IP %v, mac %v", dstIP, dstMac)
+	logrus.Debugf("Neighbor entry added for IP:%v, mac:%v on ifc:%s", dstIP, dstMac, nh.linkName)
 
 	return nil
 }
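
With a typed NeighborSearchError, callers can branch on the failure mode instead of matching error strings. A sketch of a caller-side check, assuming the same struct shape as the vendored type:

package main

import (
	"fmt"
	"net"
)

// NeighborSearchError mirrors the shape of the vendored type for illustration.
type NeighborSearchError struct {
	ip      net.IP
	mac     net.HardwareAddr
	present bool
}

func (n NeighborSearchError) Error() string {
	return fmt.Sprintf("Search neighbor failed for IP %v, mac %v, present in db:%t", n.ip, n.mac, n.present)
}

// addNeighbor simulates the "already present, force not set" path of AddNeighbor.
func addNeighbor() error {
	return NeighborSearchError{net.ParseIP("10.0.0.5"), nil, true}
}

func main() {
	if err := addNeighbor(); err != nil {
		if nse, ok := err.(NeighborSearchError); ok && nse.present {
			fmt.Println("benign: neighbor already programmed, nothing to do")
			return
		}
		fmt.Println("unexpected:", err)
	}
}
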

+ 33 - 3
vendor/github.com/docker/swarmkit/manager/allocator/cnmallocator/networkallocator.go

@@ -574,6 +574,7 @@ func (na *cnmNetworkAllocator) releaseEndpoints(networks []*api.NetworkAttachmen
 
 // allocate virtual IP for a single endpoint attachment of the service.
 func (na *cnmNetworkAllocator) allocateVIP(vip *api.Endpoint_VirtualIP) error {
+	var opts map[string]string
 	localNet := na.getNetwork(vip.NetworkID)
 	if localNet == nil {
 		return errors.New("networkallocator: could not find local network state")
@@ -603,9 +604,13 @@ func (na *cnmNetworkAllocator) allocateVIP(vip *api.Endpoint_VirtualIP) error {
 			return err
 		}
 	}
+	if localNet.nw.IPAM != nil && localNet.nw.IPAM.Driver != nil {
+		// set ipam allocation method to serial
+		opts = setIPAMSerialAlloc(localNet.nw.IPAM.Driver.Options)
+	}
 
 	for _, poolID := range localNet.pools {
-		ip, _, err := ipam.RequestAddress(poolID, addr, nil)
+		ip, _, err := ipam.RequestAddress(poolID, addr, opts)
 		if err != nil && err != ipamapi.ErrNoAvailableIPs && err != ipamapi.ErrIPOutOfRange {
 			return errors.Wrap(err, "could not allocate VIP from IPAM")
 		}
@@ -657,6 +662,7 @@ func (na *cnmNetworkAllocator) deallocateVIP(vip *api.Endpoint_VirtualIP) error
 // allocate the IP addresses for a single network attachment of the task.
 func (na *cnmNetworkAllocator) allocateNetworkIPs(nAttach *api.NetworkAttachment) error {
 	var ip *net.IPNet
+	var opts map[string]string
 
 	ipam, _, _, err := na.resolveIPAM(nAttach.Network)
 	if err != nil {
@@ -686,11 +692,16 @@ func (na *cnmNetworkAllocator) allocateNetworkIPs(nAttach *api.NetworkAttachment
 				}
 			}
 		}
+		// Set the ipam options if the network has an ipam driver.
+		if localNet.nw.IPAM != nil && localNet.nw.IPAM.Driver != nil {
+			// set ipam allocation method to serial
+			opts = setIPAMSerialAlloc(localNet.nw.IPAM.Driver.Options)
+		}
 
 		for _, poolID := range localNet.pools {
 			var err error
 
-			ip, _, err = ipam.RequestAddress(poolID, addr, nil)
+			ip, _, err = ipam.RequestAddress(poolID, addr, opts)
 			if err != nil && err != ipamapi.ErrNoAvailableIPs && err != ipamapi.ErrIPOutOfRange {
 				return errors.Wrap(err, "could not allocate IP from IPAM")
 			}
@@ -918,8 +929,16 @@ func (na *cnmNetworkAllocator) allocatePools(n *api.Network) (map[string]string,
 			}
 			gwIP.IP = ip
 		}
+		if dOptions == nil {
+			dOptions = make(map[string]string)
+		}
+		dOptions[ipamapi.RequestAddressType] = netlabel.Gateway
+		// set ipam allocation method to serial
+		dOptions = setIPAMSerialAlloc(dOptions)
+		defer delete(dOptions, ipamapi.RequestAddressType)
+
 		if ic.Gateway != "" || gwIP == nil {
-			gwIP, _, err = ipam.RequestAddress(poolID, net.ParseIP(ic.Gateway), map[string]string{ipamapi.RequestAddressType: netlabel.Gateway})
+			gwIP, _, err = ipam.RequestAddress(poolID, net.ParseIP(ic.Gateway), dOptions)
 			if err != nil {
 				// Rollback by releasing all the resources allocated so far.
 				releasePools(ipam, ipamConfigs[:i], pools)
@@ -980,3 +999,14 @@ func IsBuiltInDriver(name string) bool {
 	}
 	return false
 }
+
+// setIPAMSerialAlloc sets the ipam allocation method to serial
+func setIPAMSerialAlloc(opts map[string]string) map[string]string {
+	if opts == nil {
+		opts = make(map[string]string)
+	}
+	if _, ok := opts[ipamapi.AllocSerialPrefix]; !ok {
+		opts[ipamapi.AllocSerialPrefix] = "true"
+	}
+	return opts
+}
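
The serial-allocation label asks the IPAM driver to walk the pool serially rather than always reusing the lowest free address, which reduces immediate reuse of just-released IPs; in the gateway path the options map additionally carries RequestAddressType, removed again by the deferred delete. The helper is nil-safe and never overrides a caller-supplied value. A standalone sketch, assuming the vendored ipamapi.AllocSerialPrefix constant expands to the com.docker.network.ipam.serial label:

package main

import "fmt"

// allocSerialPrefix stands in for the vendored ipamapi.AllocSerialPrefix constant.
const allocSerialPrefix = "com.docker.network.ipam.serial"

// setIPAMSerialAlloc mirrors the helper added above: nil-safe, and it never
// overrides a value the caller already set.
func setIPAMSerialAlloc(opts map[string]string) map[string]string {
	if opts == nil {
		opts = make(map[string]string)
	}
	if _, ok := opts[allocSerialPrefix]; !ok {
		opts[allocSerialPrefix] = "true"
	}
	return opts
}

func main() {
	var opts map[string]string // nil is fine
	opts = setIPAMSerialAlloc(opts)
	fmt.Println(opts) // map[com.docker.network.ipam.serial:true]
}
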

+ 1 - 1
vendor/github.com/docker/swarmkit/manager/allocator/cnmallocator/portallocator.go

@@ -382,7 +382,7 @@ func (ps *portSpace) allocate(p *api.PortConfig) (err error) {
 	}
 
 	// Check out an arbitrary port from dynamic port space.
-	swarmPort, err := ps.dynamicPortSpace.GetID()
+	swarmPort, err := ps.dynamicPortSpace.GetID(true)
 	if err != nil {
 		return
 	}
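
GetID now takes a serial flag: passing true asks the ID space to continue from the last allocation instead of first-fit from the bottom of the range, so a just-released port is not immediately reassigned. A toy model of the two modes (not the vendored idm implementation):

package main

import (
	"errors"
	"fmt"
)

// allocator is a toy model of first-fit vs serial ID allocation over [lo, hi].
type allocator struct {
	lo, hi, next int
	inUse        map[int]bool
}

func (a *allocator) getID(serial bool) (int, error) {
	start := a.lo
	if serial {
		start = a.next // continue where the last allocation left off
	}
	size := a.hi - a.lo + 1
	for i := 0; i < size; i++ {
		id := a.lo + ((start-a.lo)+i)%size
		if !a.inUse[id] {
			a.inUse[id] = true
			a.next = a.lo + ((id-a.lo)+1)%size
			return id, nil
		}
	}
	return 0, errors.New("no IDs available")
}

func main() {
	a := &allocator{lo: 30000, hi: 30002, next: 30000, inUse: map[int]bool{}}
	p1, _ := a.getID(true)
	delete(a.inUse, p1) // release the port right away
	p2, _ := a.getID(true)
	fmt.Println(p1, p2) // 30000 30001: serial mode skips the just-released port
}
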

+ 34 - 9
vendor/github.com/docker/swarmkit/manager/state/raft/raft.go

@@ -542,6 +542,7 @@ func (n *Node) Run(ctx context.Context) error {
 		n.done()
 	}()
 
+	// Flag that indicates if this manager node is *currently* the raft leader.
 	wasLeader := false
 	transferLeadershipLimit := rate.NewLimiter(rate.Every(time.Minute), 1)
 
@@ -563,10 +564,13 @@ func (n *Node) Run(ctx context.Context) error {
 				return errors.Wrap(err, "failed to save entries to storage")
 			}
 
+			// If the memory store lock has been held for too long,
+			// transferring leadership is an easy way to break out of it.
 			if wasLeader &&
 				(rd.SoftState == nil || rd.SoftState.RaftState == raft.StateLeader) &&
 				n.memoryStore.Wedged() &&
 				transferLeadershipLimit.Allow() {
+				log.G(ctx).Error("Attempting to transfer leadership")
 				if !n.opts.DisableStackDump {
 					signal.DumpStacks("")
 				}
@@ -612,6 +616,8 @@ func (n *Node) Run(ctx context.Context) error {
 			if rd.SoftState != nil {
 				if wasLeader && rd.SoftState.RaftState != raft.StateLeader {
 					wasLeader = false
+					log.G(ctx).Error("soft state changed, node no longer a leader, resetting and cancelling all waits")
+
 					if atomic.LoadUint32(&n.signalledLeadership) == 1 {
 						atomic.StoreUint32(&n.signalledLeadership, 0)
 						n.leadershipBroadcast.Publish(IsFollower)
@@ -630,6 +636,7 @@ func (n *Node) Run(ctx context.Context) error {
 					// cancelAll, or by its own check of signalledLeadership.
 					n.wait.cancelAll()
 				} else if !wasLeader && rd.SoftState.RaftState == raft.StateLeader {
+					// Node just became a leader.
 					wasLeader = true
 				}
 			}
@@ -1478,7 +1485,7 @@ func (n *Node) registerNode(node *api.RaftMember) error {
 	return nil
 }
 
+// ProposeValue calls Propose on the underlying raft library (etcd/raft) and waits
+// ProposeValue calls Propose on the underlying raft library(etcd/raft) and waits
 // on the commit log action before returning a result
 func (n *Node) ProposeValue(ctx context.Context, storeAction []api.StoreAction, cb func()) error {
 	ctx, cancel := n.WithContext(ctx)
@@ -1654,11 +1661,14 @@ func (n *Node) saveToStorage(
 	return nil
 }
 
-// processInternalRaftRequest sends a message to nodes participating
-// in the raft to apply a log entry and then waits for it to be applied
-// on the server. It will block until the update is performed, there is
-// an error or until the raft node finalizes all the proposals on node
-// shutdown.
+// processInternalRaftRequest proposes a value to be appended to the raft log.
+// It calls Propose() on etcd/raft, which calls back into the raft FSM and
+// sends a message to each of the participating nodes in the raft group to
+// apply a log entry, and then waits for it to be applied on this node. It
+// blocks until this node:
+// 1. Gets the necessary replies back from the participating nodes and also performs the commit itself, or
+// 2. Encounters an error, or
+// 3. Finalizes all outstanding proposals on node shutdown.
 func (n *Node) processInternalRaftRequest(ctx context.Context, r *api.InternalRaftRequest, cb func()) (proto.Message, error) {
 	n.stopMu.RLock()
 	if !n.IsMember() {
@@ -1679,6 +1689,7 @@ func (n *Node) processInternalRaftRequest(ctx context.Context, r *api.InternalRa
 
 	// Do this check after calling register to avoid a race.
 	if atomic.LoadUint32(&n.signalledLeadership) != 1 {
+		log.G(ctx).Error("node is no longer leader, aborting propose")
 		n.wait.cancel(r.ID)
 		return nil, ErrLostLeadership
 	}
@@ -1703,14 +1714,23 @@ func (n *Node) processInternalRaftRequest(ctx context.Context, r *api.InternalRa
 	select {
 	case x, ok := <-ch:
 		if !ok {
+			// Wait notification channel was closed. This should only happen if the wait was cancelled.
+			log.G(ctx).Error("wait cancelled")
+			if atomic.LoadUint32(&n.signalledLeadership) == 1 {
+				log.G(ctx).Error("wait cancelled but node is still a leader")
+			}
 			return nil, ErrLostLeadership
 		}
 		return x.(proto.Message), nil
 	case <-waitCtx.Done():
 		n.wait.cancel(r.ID)
-		// if channel is closed, wait item was canceled, otherwise it was triggered
+		// If we can read from the channel, the wait item was triggered; otherwise it was cancelled.
 		x, ok := <-ch
 		if !ok {
+			log.G(ctx).WithError(waitCtx.Err()).Error("wait context cancelled")
+			if atomic.LoadUint32(&n.signalledLeadership) == 1 {
+				log.G(ctx).Error("wait context cancelled but node is still a leader")
+			}
 			return nil, ErrLostLeadership
 		}
 		return x.(proto.Message), nil
@@ -1779,21 +1799,26 @@ func (n *Node) processEntry(ctx context.Context, entry raftpb.Entry) error {
 	}
 
 	if !n.wait.trigger(r.ID, r) {
+		log.G(ctx).Errorf("wait not found for raft request id %x", r.ID)
+
 		// There was no wait on this ID, meaning we don't have a
 		// transaction in progress that would be committed to the
 		// memory store by the "trigger" call. Either a different node
 		// wrote this to raft, or we wrote it before losing the leader
-		// position and cancelling the transaction. Create a new
-		// transaction to commit the data.
+		// position and cancelling the transaction. This entry still needs
+		// to be committed since other nodes have already committed it.
+		// Create a new transaction to commit this entry.
 
 		// It should not be possible for processInternalRaftRequest
 		// to be running in this situation, but out of caution we
 		// cancel any current invocations to avoid a deadlock.
+		// TODO(anshul) This call is likely redundant, remove after consideration.
 		n.wait.cancelAll()
 
 		err := n.memoryStore.ApplyStoreActions(r.Action)
 		if err != nil {
 			log.G(ctx).WithError(err).Error("failed to apply actions from raft")
+			// TODO(anshul) return err here ?
 		}
 	}
 	return nil
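
The select that these comments describe follows a small contract: a value received on the wait channel means the proposal was committed and triggered; a closed channel means the wait was cancelled and leadership was lost. A simplified sketch of that contract (not the vendored wait registry):

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

var errLostLeadership = errors.New("lost leadership")

// awaitProposal models the select in processInternalRaftRequest. cancelWait
// stands in for n.wait.cancel(r.ID): it closes ch if no trigger fired first.
func awaitProposal(ctx context.Context, ch chan string, cancelWait func()) (string, error) {
	select {
	case x, ok := <-ch:
		if !ok {
			return "", errLostLeadership // closed channel: wait was cancelled
		}
		return x, nil // value received: entry committed and wait triggered
	case <-ctx.Done():
		cancelWait()
		// A trigger may have raced with cancellation: drain once more.
		if x, ok := <-ch; ok {
			return x, nil
		}
		return "", errLostLeadership
	}
}

func main() {
	ch := make(chan string, 1)
	ch <- "committed"
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	out, err := awaitProposal(ctx, ch, func() { close(ch) })
	fmt.Println(out, err) // committed <nil>
}
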

+ 1 - 2
vendor/github.com/docker/swarmkit/manager/state/store/memory.go

@@ -83,8 +83,7 @@ func register(os ObjectStoreConfig) {
 	schema.Tables[os.Table.Name] = os.Table
 }
 
-// timedMutex wraps a sync.Mutex, and keeps track of how long it has been
-// locked.
+// timedMutex wraps a sync.Mutex, and keeps track of when it was locked.
 type timedMutex struct {
 	sync.Mutex
 	lockedAt atomic.Value
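
The reworded comment reflects what the type actually stores: the timestamp of the most recent Lock, from which a watchdog (the Wedged() check in the raft loop above) can derive how long the lock has been held. A minimal sketch of the idea; the method names here are illustrative, not the vendored API, and a fuller version would also clear lockedAt on Unlock:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

// timedMutex records when the lock was last taken so a watchdog can tell
// how long it has been held.
type timedMutex struct {
	sync.Mutex
	lockedAt atomic.Value // time.Time of the most recent Lock
}

func (m *timedMutex) Lock() {
	m.Mutex.Lock()
	m.lockedAt.Store(time.Now())
}

// heldLongerThan reports whether the current hold exceeds limit.
func (m *timedMutex) heldLongerThan(limit time.Duration) bool {
	t, ok := m.lockedAt.Load().(time.Time)
	return ok && time.Since(t) > limit
}

func main() {
	var m timedMutex
	m.Lock()
	time.Sleep(10 * time.Millisecond)
	fmt.Println(m.heldLongerThan(5 * time.Millisecond)) // true
	m.Unlock()
}
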

+ 1 - 1
vendor/github.com/docker/swarmkit/vendor.conf

@@ -24,7 +24,7 @@ github.com/docker/go-connections 3ede32e2033de7505e6500d6c868c2b9ed9f169d
 github.com/docker/go-events 9461782956ad83b30282bf90e31fa6a70c255ba9
 github.com/docker/go-units 954fed01cc617c55d838fa2230073f2cb17386c8
 github.com/docker/libkv 9fd56606e928ff1f309808f5d5a0b7a2ef73f9a8
-github.com/docker/libnetwork 19ac3ea7f52bb46e0eb10669756cdae0c441a5b1 
+github.com/docker/libnetwork 21544598c53fa36a3c771a8725c643dd2340f845 
 github.com/docker/libtrust 9cbd2a1374f46905c68a4eb3694a130610adc62a
 github.com/opencontainers/runc d40db12e72a40109dfcf28539f5ee0930d2f0277
 github.com/opencontainers/go-digest 21dfd564fd89c944783d00d069f33e3e7123c448