diff --git a/libnetwork/sandbox.go b/libnetwork/sandbox.go index 454ed57715..7b30ade614 100644 --- a/libnetwork/sandbox.go +++ b/libnetwork/sandbox.go @@ -66,6 +66,7 @@ type sandbox struct { joinLeaveDone chan struct{} dbIndex uint64 dbExists bool + inDelete bool sync.Mutex } @@ -146,6 +147,22 @@ func (sb *sandbox) Statistics() (map[string]*types.InterfaceStatistics, error) { } func (sb *sandbox) Delete() error { + sb.Lock() + if sb.inDelete { + sb.Unlock() + return types.ForbiddenErrorf("another sandbox delete in progress") + } + // Set the inDelete flag. This will ensure that we don't + // update the store until we have completed all the endpoint + // leaves and deletes. And when endpoint leaves and deletes + // are completed then we can finally delete the sandbox object + // altogether from the data store. If the daemon exits + // ungracefully in the middle of a sandbox delete this way we + // will have all the references to the endpoints in the + // sandbox so that we can clean them up when we restart + sb.inDelete = true + sb.Unlock() + c := sb.controller // Detach from all endpoints @@ -355,6 +372,10 @@ func releaseOSSboxResources(osSbox osl.Sandbox, ep *endpoint) { joinInfo := ep.joinInfo ep.Unlock() + if joinInfo == nil { + return + } + // Remove non-interface routes. for _, r := range joinInfo.StaticRoutes { if err := osSbox.RemoveStaticRoute(r); err != nil { @@ -386,6 +407,7 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error { sb.Unlock() return nil } + inDelete := sb.inDelete sb.Unlock() ep.Lock() @@ -425,7 +447,16 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error { } } } - return sb.storeUpdate() + + // Only update the store if we did not come here as part of + // sandbox delete. If we came here as part of delete then do + // not bother updating the store. The sandbox object will be + // deleted anyway + if !inDelete { + return sb.storeUpdate() + } + + return nil } func (sb *sandbox) clearNetworkResources(origEp *endpoint) error { @@ -437,6 +468,7 @@ func (sb *sandbox) clearNetworkResources(origEp *endpoint) error { sb.Lock() osSbox := sb.osSbox + inDelete := sb.inDelete sb.Unlock() if osSbox != nil { releaseOSSboxResources(osSbox, ep) @@ -480,7 +512,15 @@ func (sb *sandbox) clearNetworkResources(origEp *endpoint) error { sb.updateGateway(gwepAfter) } - return sb.storeUpdate() + // Only update the store if we did not come here as part of + // sandbox delete. If we came here as part of delete then do + // not bother updating the store. The sandbox object will be + // deleted anyway + if !inDelete { + return sb.storeUpdate() + } + + return nil } const ( diff --git a/libnetwork/sandbox_store.go b/libnetwork/sandbox_store.go index 2a86cbc4cc..0b8f0c95e3 100644 --- a/libnetwork/sandbox_store.go +++ b/libnetwork/sandbox_store.go @@ -123,6 +123,8 @@ func (sb *sandbox) storeUpdate() error { ID: sb.id, } +retry: + sbs.Eps = nil for _, ep := range sb.getConnectedEndpoints() { eps := epState{ Nid: ep.getNetwork().ID(), @@ -132,7 +134,16 @@ func (sb *sandbox) storeUpdate() error { sbs.Eps = append(sbs.Eps, eps) } - return sb.controller.updateToStore(sbs) + err := sb.controller.updateToStore(sbs) + if err == datastore.ErrKeyModified { + // When we get ErrKeyModified it is sufficient to just + // go back and retry. No need to get the object from + // the store because we always regenerate the store + // state from in memory sandbox state + goto retry + } + + return err } func (sb *sandbox) storeDelete() error { diff --git a/libnetwork/test/integration/dnet/bridge.bats b/libnetwork/test/integration/dnet/bridge.bats index 325110fab9..eff60eee6f 100644 --- a/libnetwork/test/integration/dnet/bridge.bats +++ b/libnetwork/test/integration/dnet/bridge.bats @@ -64,6 +64,23 @@ function test_single_network_connectivity() { done } +@test "Test default network dnet ungraceful restart" { + skip_for_circleci + + echo $(docker ps) + + for iter in `seq 1 2`; + do + if [ "$iter" -eq 1 ]; then + test_single_network_connectivity bridge 3 skip + docker restart dnet-1-bridge + wait_for_dnet $(inst_id2port 1) dnet-1-bridge + else + test_single_network_connectivity bridge 3 + fi + done +} + @test "Test bridge network" { skip_for_circleci diff --git a/libnetwork/test/integration/dnet/run-integration-tests.sh b/libnetwork/test/integration/dnet/run-integration-tests.sh index 45123cacbb..9184025c25 100755 --- a/libnetwork/test/integration/dnet/run-integration-tests.sh +++ b/libnetwork/test/integration/dnet/run-integration-tests.sh @@ -99,7 +99,7 @@ function run_dnet_tests() { ./integration-tmp/bin/bats ./test/integration/dnet/dnet.bats } -function run_simple_tests() { +function run_simple_consul_tests() { # Test a single node configuration with a global scope test driver ## Setup start_dnet 1 simple 1>>${INTEGRATION_ROOT}/test.log 2>&1 @@ -205,15 +205,15 @@ if [ -z "$SUITES" ]; then then # We can only run a limited list of suites in circleci because of the # old kernel and limited docker environment. - suites="dnet simple multi_consul multi_zk multi_etcd" + suites="dnet simple_consul multi_consul multi_zk multi_etcd" else - suites="dnet simple multi_consul multi_zk multi_etcd bridge overlay_consul overlay_zk overlay_etcd" + suites="dnet simple_consul multi_consul multi_zk multi_etcd bridge overlay_consul overlay_zk overlay_etcd" fi else suites="$SUITES" fi -if [[ "$suites" =~ .*consul.* ]]; then +if [[ ( "$suites" =~ .*consul.* ) || ( "$suites" =~ .*bridge.* ) ]]; then echo "Starting consul ..." start_consul 1>>${INTEGRATION_ROOT}/test.log 2>&1 cmap[pr_consul]=pr_consul