浏览代码

Merge pull request #675 from mrjana/model

Make sandbox cleanup robust for ungraceful exits
Madhu Venugopal 9 年之前
父节点
当前提交
9145f18132

+ 42 - 2
libnetwork/sandbox.go

@@ -66,6 +66,7 @@ type sandbox struct {
 	joinLeaveDone chan struct{}
 	dbIndex       uint64
 	dbExists      bool
+	inDelete      bool
 	sync.Mutex
 }
 
@@ -146,6 +147,22 @@ func (sb *sandbox) Statistics() (map[string]*types.InterfaceStatistics, error) {
 }
 
 func (sb *sandbox) Delete() error {
+	sb.Lock()
+	if sb.inDelete {
+		sb.Unlock()
+		return types.ForbiddenErrorf("another sandbox delete in progress")
+	}
+	// Set the inDelete flag. This will ensure that we don't
+	// update the store until we have completed all the endpoint
+	// leaves and deletes. And when endpoint leaves and deletes
+	// are completed then we can finally delete the sandbox object
+	// altogether from the data store. If the daemon exits
+	// ungracefully in the middle of a sandbox delete this way we
+	// will have all the references to the endpoints in the
+	// sandbox so that we can clean them up when we restart
+	sb.inDelete = true
+	sb.Unlock()
+
 	c := sb.controller
 
 	// Detach from all endpoints
@@ -355,6 +372,10 @@ func releaseOSSboxResources(osSbox osl.Sandbox, ep *endpoint) {
 	joinInfo := ep.joinInfo
 	ep.Unlock()
 
+	if joinInfo == nil {
+		return
+	}
+
 	// Remove non-interface routes.
 	for _, r := range joinInfo.StaticRoutes {
 		if err := osSbox.RemoveStaticRoute(r); err != nil {
@@ -386,6 +407,7 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
 		sb.Unlock()
 		return nil
 	}
+	inDelete := sb.inDelete
 	sb.Unlock()
 
 	ep.Lock()
@@ -425,7 +447,16 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
 			}
 		}
 	}
-	return sb.storeUpdate()
+
+	// Only update the store if we did not come here as part of
+	// sandbox delete. If we came here as part of delete then do
+	// not bother updating the store. The sandbox object will be
+	// deleted anyway
+	if !inDelete {
+		return sb.storeUpdate()
+	}
+
+	return nil
 }
 
 func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
@@ -437,6 +468,7 @@ func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
 
 	sb.Lock()
 	osSbox := sb.osSbox
+	inDelete := sb.inDelete
 	sb.Unlock()
 	if osSbox != nil {
 		releaseOSSboxResources(osSbox, ep)
@@ -480,7 +512,15 @@ func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
 		sb.updateGateway(gwepAfter)
 	}
 
-	return sb.storeUpdate()
+	// Only update the store if we did not come here as part of
+	// sandbox delete. If we came here as part of delete then do
+	// not bother updating the store. The sandbox object will be
+	// deleted anyway
+	if !inDelete {
+		return sb.storeUpdate()
+	}
+
+	return nil
 }
 
 const (

+ 12 - 1
libnetwork/sandbox_store.go

@@ -123,6 +123,8 @@ func (sb *sandbox) storeUpdate() error {
 		ID: sb.id,
 	}
 
+retry:
+	sbs.Eps = nil
 	for _, ep := range sb.getConnectedEndpoints() {
 		eps := epState{
 			Nid: ep.getNetwork().ID(),
@@ -132,7 +134,16 @@ func (sb *sandbox) storeUpdate() error {
 		sbs.Eps = append(sbs.Eps, eps)
 	}
 
-	return sb.controller.updateToStore(sbs)
+	err := sb.controller.updateToStore(sbs)
+	if err == datastore.ErrKeyModified {
+		// When we get ErrKeyModified it is sufficient to just
+		// go back and retry.  No need to get the object from
+		// the store because we always regenerate the store
+		// state from in memory sandbox state
+		goto retry
+	}
+
+	return err
 }
 
 func (sb *sandbox) storeDelete() error {

+ 17 - 0
libnetwork/test/integration/dnet/bridge.bats

@@ -64,6 +64,23 @@ function test_single_network_connectivity() {
     done
 }
 
+@test "Test default network dnet ungraceful restart" {
+    skip_for_circleci
+
+    echo $(docker ps)
+
+    for iter in `seq 1 2`;
+    do
+	if [ "$iter" -eq 1 ]; then
+	    test_single_network_connectivity bridge 3 skip
+	    docker restart dnet-1-bridge
+	    wait_for_dnet $(inst_id2port 1) dnet-1-bridge
+	else
+	    test_single_network_connectivity bridge 3
+	fi
+    done
+}
+
 @test "Test bridge network" {
     skip_for_circleci
 

+ 4 - 4
libnetwork/test/integration/dnet/run-integration-tests.sh

@@ -99,7 +99,7 @@ function run_dnet_tests() {
     ./integration-tmp/bin/bats ./test/integration/dnet/dnet.bats
 }
 
-function run_simple_tests() {
+function run_simple_consul_tests() {
     # Test a single node configuration with a global scope test driver
     ## Setup
     start_dnet 1 simple 1>>${INTEGRATION_ROOT}/test.log 2>&1
@@ -205,15 +205,15 @@ if [ -z "$SUITES" ]; then
     then
 	# We can only run a limited list of suites in circleci because of the
 	# old kernel and limited docker environment.
-	suites="dnet simple multi_consul multi_zk multi_etcd"
+	suites="dnet simple_consul multi_consul multi_zk multi_etcd"
     else
-	suites="dnet simple multi_consul multi_zk multi_etcd  bridge overlay_consul overlay_zk overlay_etcd"
+	suites="dnet simple_consul multi_consul multi_zk multi_etcd  bridge overlay_consul overlay_zk overlay_etcd"
     fi
 else
     suites="$SUITES"
 fi
 
-if [[ "$suites" =~ .*consul.* ]]; then
+if [[ ( "$suites" =~ .*consul.* ) ||  ( "$suites" =~ .*bridge.* ) ]]; then
     echo "Starting consul ..."
     start_consul 1>>${INTEGRATION_ROOT}/test.log 2>&1
     cmap[pr_consul]=pr_consul