浏览代码

Cleanup dangling sandboxes on boot up

Currently, when docker exits ungracefully, it may leave
dangling sandboxes that hold onto precious network
resources. This change adds checkpoint state for sandboxes,
which is used on boot up to clean up those sandboxes and
their network resources.

On boot up, any dangling state remaining in the checkpoint
is read and cleaned up before any new network allocation
requests are accepted.

Signed-off-by: Jana Radhakrishnan <mrjana@docker.com>
Jana Radhakrishnan 9 年之前
父节点
当前提交
e41b4765bd

+ 14 - 0
libnetwork/controller.go

@@ -191,6 +191,8 @@ func New(cfgOptions ...config.Option) (NetworkController, error) {
 		return nil, err
 		return nil, err
 	}
 	}
 
 
+	c.sandboxCleanup()
+
 	if err := c.startExternalKeyListener(); err != nil {
 	if err := c.startExternalKeyListener(); err != nil {
 		return nil, err
 		return nil, err
 	}
 	}
@@ -500,6 +502,18 @@ func (c *controller) NewSandbox(containerID string, options ...SandboxOption) (S
 	c.Lock()
 	c.Lock()
 	c.sandboxes[sb.id] = sb
 	c.sandboxes[sb.id] = sb
 	c.Unlock()
 	c.Unlock()
+	defer func() {
+		if err != nil {
+			c.Lock()
+			delete(c.sandboxes, sb.id)
+			c.Unlock()
+		}
+	}()
+
+	err = sb.storeUpdate()
+	if err != nil {
+		return nil, fmt.Errorf("updating the store state of sandbox failed: %v", err)
+	}
 
 
 	return sb, nil
 	return sb, nil
 }
 }

+ 1 - 1
libnetwork/endpoint.go

@@ -221,7 +221,7 @@ func (ep *endpoint) Exists() bool {
 }
 }
 
 
 func (ep *endpoint) Skip() bool {
 func (ep *endpoint) Skip() bool {
-	return ep.getNetwork().Skip() || ep.DataScope() == datastore.LocalScope
+	return ep.getNetwork().Skip()
 }
 }
 
 
 func (ep *endpoint) processOptions(options ...EndpointOption) {
 func (ep *endpoint) processOptions(options ...EndpointOption) {

+ 5 - 69
libnetwork/libnetwork_test.go

@@ -1479,21 +1479,11 @@ func TestLeaveAll(t *testing.T) {
 	if err != nil {
 	if err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
-	defer func() {
-		if err := ep1.Delete(); err != nil {
-			t.Fatal(err)
-		}
-	}()
 
 
 	ep2, err := n.CreateEndpoint("ep2")
 	ep2, err := n.CreateEndpoint("ep2")
 	if err != nil {
 	if err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
-	defer func() {
-		if err := ep2.Delete(); err != nil {
-			t.Fatal(err)
-		}
-	}()
 
 
 	cnt, err := controller.NewSandbox("leaveall")
 	cnt, err := controller.NewSandbox("leaveall")
 	if err != nil {
 	if err != nil {
@@ -1607,21 +1597,11 @@ func TestEndpointUpdateParent(t *testing.T) {
 	if err != nil {
 	if err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
-	defer func() {
-		if err := ep1.Delete(); err != nil {
-			t.Fatal(err)
-		}
-	}()
 
 
 	ep2, err := n.CreateEndpoint("ep2")
 	ep2, err := n.CreateEndpoint("ep2")
 	if err != nil {
 	if err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
-	defer func() {
-		if err := ep2.Delete(); err != nil {
-			t.Fatal(err)
-		}
-	}()
 
 
 	sbx1, err := controller.NewSandbox(containerID,
 	sbx1, err := controller.NewSandbox(containerID,
 		libnetwork.OptionHostname("test"),
 		libnetwork.OptionHostname("test"),
@@ -1661,12 +1641,6 @@ func TestEndpointUpdateParent(t *testing.T) {
 	if err != nil {
 	if err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
-
-	err = ep2.Leave(sbx2)
-	runtime.LockOSThread()
-	if err != nil {
-		t.Fatal(err)
-	}
 }
 }
 
 
 func TestEnableIPv6(t *testing.T) {
 func TestEnableIPv6(t *testing.T) {
@@ -1714,11 +1688,6 @@ func TestEnableIPv6(t *testing.T) {
 	if err != nil {
 	if err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
-	defer func() {
-		if err := ep1.Delete(); err != nil {
-			t.Fatal(err)
-		}
-	}()
 
 
 	if err := ioutil.WriteFile("/etc/resolv.conf", tmpResolvConf, 0644); err != nil {
 	if err := ioutil.WriteFile("/etc/resolv.conf", tmpResolvConf, 0644); err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
@@ -1741,13 +1710,6 @@ func TestEnableIPv6(t *testing.T) {
 	if err != nil {
 	if err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
-	defer func() {
-		err = ep1.Leave(sb)
-		runtime.LockOSThread()
-		if err != nil {
-			t.Fatal(err)
-		}
-	}()
 
 
 	content, err := ioutil.ReadFile(resolvConfPath)
 	content, err := ioutil.ReadFile(resolvConfPath)
 	if err != nil {
 	if err != nil {
@@ -1884,11 +1846,6 @@ func TestResolvConf(t *testing.T) {
 	if err != nil {
 	if err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
-	defer func() {
-		if err := ep.Delete(); err != nil {
-			t.Fatal(err)
-		}
-	}()
 
 
 	if err := ioutil.WriteFile("/etc/resolv.conf", tmpResolvConf1, 0644); err != nil {
 	if err := ioutil.WriteFile("/etc/resolv.conf", tmpResolvConf1, 0644); err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
@@ -2204,24 +2161,9 @@ func parallelLeave(t *testing.T, rc libnetwork.Sandbox, ep libnetwork.Endpoint,
 	debugf("L%d.", thrNumber)
 	debugf("L%d.", thrNumber)
 	var err error
 	var err error
 
 
-	cid := fmt.Sprintf("%drace", thrNumber)
 	sb := sboxes[thrNumber-1]
 	sb := sboxes[thrNumber-1]
 
 
-	if thrNumber == first {
-		err = ep.Leave(sb)
-	} else {
-		err = sb.Delete()
-		// re add sandbox
-		defer func() {
-			if err == nil {
-				var e error
-				if sboxes[thrNumber-1], e = controller.NewSandbox(cid); e != nil {
-					t.Fatalf("Failed to recreate sandbox %s: %v", cid, e)
-				}
-			}
-		}()
-	}
-
+	err = ep.Leave(sb)
 	runtime.LockOSThread()
 	runtime.LockOSThread()
 	if err != nil {
 	if err != nil {
 		if _, ok := err.(types.ForbiddenError); !ok {
 		if _, ok := err.(types.ForbiddenError); !ok {
@@ -2324,11 +2266,10 @@ func runParallelTests(t *testing.T, thrNumber int) {
 
 
 	debugf("\n")
 	debugf("\n")
 
 
-	err = ep.Delete()
+	err = sb.Delete()
 	if err != nil {
 	if err != nil {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
-
 	if thrNumber == first {
 	if thrNumber == first {
 		for thrdone := range done {
 		for thrdone := range done {
 			select {
 			select {
@@ -2337,19 +2278,14 @@ func runParallelTests(t *testing.T, thrNumber int) {
 		}
 		}
 
 
 		testns.Close()
 		testns.Close()
-		err = sb.Delete()
-		if err != nil {
+		if err := net2.Delete(); err != nil {
 			t.Fatal(err)
 			t.Fatal(err)
 		}
 		}
-
-		ep.Delete()
+	} else {
+		err = ep.Delete()
 		if err != nil {
 		if err != nil {
 			t.Fatal(err)
 			t.Fatal(err)
 		}
 		}
-
-		if err := net2.Delete(); err != nil {
-			t.Fatal(err)
-		}
 	}
 	}
 }
 }
 
 

+ 13 - 2
libnetwork/sandbox.go

@@ -64,6 +64,8 @@ type sandbox struct {
 	endpoints     epHeap
 	endpoints     epHeap
 	epPriority    map[string]int
 	epPriority    map[string]int
 	joinLeaveDone chan struct{}
 	joinLeaveDone chan struct{}
+	dbIndex       uint64
+	dbExists      bool
 	sync.Mutex
 	sync.Mutex
 }
 }
 
 
@@ -153,15 +155,24 @@ func (sb *sandbox) Delete() error {
 		if ep.endpointInGWNetwork() {
 		if ep.endpointInGWNetwork() {
 			continue
 			continue
 		}
 		}
+
 		if err := ep.Leave(sb); err != nil {
 		if err := ep.Leave(sb); err != nil {
 			log.Warnf("Failed detaching sandbox %s from endpoint %s: %v\n", sb.ID(), ep.ID(), err)
 			log.Warnf("Failed detaching sandbox %s from endpoint %s: %v\n", sb.ID(), ep.ID(), err)
 		}
 		}
+
+		if err := ep.Delete(); err != nil {
+			log.Warnf("Failed deleting endpoint %s: %v\n", ep.ID(), err)
+		}
 	}
 	}
 
 
 	if sb.osSbox != nil {
 	if sb.osSbox != nil {
 		sb.osSbox.Destroy()
 		sb.osSbox.Destroy()
 	}
 	}
 
 
+	if err := sb.storeDelete(); err != nil {
+		log.Warnf("Failed to delete sandbox %s from store: %v", sb.ID(), err)
+	}
+
 	c.Lock()
 	c.Lock()
 	delete(c.sandboxes, sb.ID())
 	delete(c.sandboxes, sb.ID())
 	c.Unlock()
 	c.Unlock()
@@ -369,7 +380,7 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
 			}
 			}
 		}
 		}
 	}
 	}
-	return nil
+	return sb.storeUpdate()
 }
 }
 
 
 func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
 func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
@@ -442,7 +453,7 @@ func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
 		sb.updateGateway(gwepAfter)
 		sb.updateGateway(gwepAfter)
 	}
 	}
 
 
-	return nil
+	return sb.storeUpdate()
 }
 }
 
 
 const (
 const (

+ 212 - 0
libnetwork/sandbox_store.go

@@ -0,0 +1,212 @@
+package libnetwork
+
+import (
+	"container/heap"
+	"encoding/json"
+
+	"github.com/Sirupsen/logrus"
+	"github.com/docker/libnetwork/datastore"
+	"github.com/docker/libnetwork/osl"
+)
+
+const (
+	sandboxPrefix = "sandbox"
+)
+
+type epState struct {
+	Eid string
+	Nid string
+}
+
+type sbState struct {
+	ID       string
+	Cid      string
+	c        *controller
+	dbIndex  uint64
+	dbExists bool
+	Eps      []epState
+}
+
+func (sbs *sbState) Key() []string {
+	return []string{sandboxPrefix, sbs.ID}
+}
+
+func (sbs *sbState) KeyPrefix() []string {
+	return []string{sandboxPrefix}
+}
+
+func (sbs *sbState) Value() []byte {
+	b, err := json.Marshal(sbs)
+	if err != nil {
+		return nil
+	}
+	return b
+}
+
+func (sbs *sbState) SetValue(value []byte) error {
+	return json.Unmarshal(value, sbs)
+}
+
+func (sbs *sbState) Index() uint64 {
+	sbi, err := sbs.c.SandboxByID(sbs.ID)
+	if err != nil {
+		return sbs.dbIndex
+	}
+
+	sb := sbi.(*sandbox)
+	maxIndex := sb.dbIndex
+	if sbs.dbIndex > maxIndex {
+		maxIndex = sbs.dbIndex
+	}
+
+	return maxIndex
+}
+
+func (sbs *sbState) SetIndex(index uint64) {
+	sbs.dbIndex = index
+	sbs.dbExists = true
+
+	sbi, err := sbs.c.SandboxByID(sbs.ID)
+	if err != nil {
+		return
+	}
+
+	sb := sbi.(*sandbox)
+	sb.dbIndex = index
+	sb.dbExists = true
+}
+
+func (sbs *sbState) Exists() bool {
+	if sbs.dbExists {
+		return sbs.dbExists
+	}
+
+	sbi, err := sbs.c.SandboxByID(sbs.ID)
+	if err != nil {
+		return false
+	}
+
+	sb := sbi.(*sandbox)
+	return sb.dbExists
+}
+
+func (sbs *sbState) Skip() bool {
+	return false
+}
+
+func (sbs *sbState) New() datastore.KVObject {
+	return &sbState{c: sbs.c}
+}
+
+func (sbs *sbState) CopyTo(o datastore.KVObject) error {
+	dstSbs := o.(*sbState)
+	dstSbs.c = sbs.c
+	dstSbs.ID = sbs.ID
+	dstSbs.Cid = sbs.Cid
+	dstSbs.dbIndex = sbs.dbIndex
+	dstSbs.dbExists = sbs.dbExists
+
+	for _, eps := range sbs.Eps {
+		dstSbs.Eps = append(dstSbs.Eps, eps)
+	}
+
+	return nil
+}
+
+func (sbs *sbState) DataScope() string {
+	return datastore.LocalScope
+}
+
+func (sb *sandbox) storeUpdate() error {
+	sbs := &sbState{
+		c:  sb.controller,
+		ID: sb.id,
+	}
+
+	for _, ep := range sb.getConnectedEndpoints() {
+		eps := epState{
+			Nid: ep.getNetwork().ID(),
+			Eid: ep.ID(),
+		}
+
+		sbs.Eps = append(sbs.Eps, eps)
+	}
+
+	return sb.controller.updateToStore(sbs)
+}
+
+func (sb *sandbox) storeDelete() error {
+	sbs := &sbState{
+		c:        sb.controller,
+		ID:       sb.id,
+		Cid:      sb.containerID,
+		dbIndex:  sb.dbIndex,
+		dbExists: sb.dbExists,
+	}
+
+	return sb.controller.deleteFromStore(sbs)
+}
+
+func (c *controller) sandboxCleanup() {
+	store := c.getStore(datastore.LocalScope)
+	if store == nil {
+		logrus.Errorf("Could not find local scope store while trying to cleanup sandboxes")
+		return
+	}
+
+	kvol, err := store.List(datastore.Key(sandboxPrefix), &sbState{c: c})
+	if err != nil && err != datastore.ErrKeyNotFound {
+		logrus.Errorf("failed to get sandboxes for scope %s: %v", store.Scope(), err)
+		return
+	}
+
+	// It's normal for no sandboxes to be found. Just bail out.
+	if err == datastore.ErrKeyNotFound {
+		return
+	}
+
+	for _, kvo := range kvol {
+		sbs := kvo.(*sbState)
+
+		logrus.Printf("sandboxcleanup sbs = %+v", sbs)
+		sb := &sandbox{
+			id:          sbs.ID,
+			controller:  sbs.c,
+			containerID: sbs.Cid,
+			endpoints:   epHeap{},
+			epPriority:  map[string]int{},
+			dbIndex:     sbs.dbIndex,
+			dbExists:    true,
+		}
+
+		sb.osSbox, err = osl.NewSandbox(sb.Key(), true)
+		if err != nil {
+			logrus.Errorf("failed to create new osl sandbox while trying to build sandbox for cleanup: %v", err)
+			continue
+		}
+
+		for _, eps := range sbs.Eps {
+			n, err := c.getNetworkFromStore(eps.Nid)
+			if err != nil {
+				logrus.Errorf("getNetworkFromStore for nid %s failed while trying to build sandbox for cleanup: %v", eps.Nid, err)
+				continue
+			}
+
+			ep, err := n.getEndpointFromStore(eps.Eid)
+			if err != nil {
+				logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
+				continue
+			}
+
+			heap.Push(&sb.endpoints, ep)
+		}
+
+		c.Lock()
+		c.sandboxes[sb.id] = sb
+		c.Unlock()
+
+		if err := sb.Delete(); err != nil {
+			logrus.Errorf("failed to delete sandbox %s while trying to cleanup: %v", sb.id, err)
+		}
+	}
+}

+ 2 - 2
libnetwork/store_test.go

@@ -76,8 +76,8 @@ func testLocalBackend(t *testing.T, provider, url string, storeConfig *store.Con
 	if exists, err := store.Exists(datastore.Key(datastore.NetworkKeyPrefix, string(nw.ID()))); !exists || err != nil {
 	if exists, err := store.Exists(datastore.Key(datastore.NetworkKeyPrefix, string(nw.ID()))); !exists || err != nil {
 		t.Fatalf("Network key should have been created.")
 		t.Fatalf("Network key should have been created.")
 	}
 	}
-	if exists, err := store.Exists(datastore.Key([]string{datastore.EndpointKeyPrefix, string(nw.ID()), string(ep.ID())}...)); exists || err != nil {
-		t.Fatalf("Endpoint key shouldn't have been created.")
+	if exists, err := store.Exists(datastore.Key([]string{datastore.EndpointKeyPrefix, string(nw.ID()), string(ep.ID())}...)); !exists || err != nil {
+		t.Fatalf("Endpoint key should have been created.")
 	}
 	}
 	ctrl.(*controller).getStore(datastore.LocalScope).Close()
 	ctrl.(*controller).getStore(datastore.LocalScope).Close()
 
 

+ 25 - 0
libnetwork/test/integration/dnet/bridge.bats

@@ -30,6 +30,10 @@ function test_single_network_connectivity() {
 	done
 	done
     done
     done
 
 
+    if [ -n "$3" ]; then
+	return
+    fi
+
     # Teardown the container connections and the network
     # Teardown the container connections and the network
     for i in `seq ${start} ${end}`;
     for i in `seq ${start} ${end}`;
     do
     do
@@ -70,6 +74,27 @@ function test_single_network_connectivity() {
     dnet_cmd $(inst_id2port 1) network rm singlehost
     dnet_cmd $(inst_id2port 1) network rm singlehost
 }
 }
 
 
+@test "Test bridge network dnet ungraceful restart" {
+    skip_for_circleci
+
+    echo $(docker ps)
+    dnet_cmd $(inst_id2port 1) network create -d bridge singlehost
+
+    for iter in `seq 1 2`;
+    do
+	if [ "$iter" -eq 1 ]; then
+	    test_single_network_connectivity singlehost 3 skip
+	else
+	    test_single_network_connectivity singlehost 3
+	fi
+
+	docker restart dnet-1-bridge
+	sleep 5
+    done
+
+    dnet_cmd $(inst_id2port 1) network rm singlehost
+}
+
 @test "Test multiple bridge networks" {
 @test "Test multiple bridge networks" {
     skip_for_circleci
     skip_for_circleci