Merge pull request #675 from mrjana/model
Make sandbox cleanup robust for ungraceful exits
This commit is contained in:
commit
9145f18132
4 changed files with 75 additions and 7 deletions
|
@ -66,6 +66,7 @@ type sandbox struct {
|
|||
joinLeaveDone chan struct{}
|
||||
dbIndex uint64
|
||||
dbExists bool
|
||||
inDelete bool
|
||||
sync.Mutex
|
||||
}
|
||||
|
||||
|
@ -146,6 +147,22 @@ func (sb *sandbox) Statistics() (map[string]*types.InterfaceStatistics, error) {
|
|||
}
|
||||
|
||||
func (sb *sandbox) Delete() error {
|
||||
sb.Lock()
|
||||
if sb.inDelete {
|
||||
sb.Unlock()
|
||||
return types.ForbiddenErrorf("another sandbox delete in progress")
|
||||
}
|
||||
// Set the inDelete flag. This will ensure that we don't
|
||||
// update the store until we have completed all the endpoint
|
||||
// leaves and deletes. And when endpoint leaves and deletes
|
||||
// are completed then we can finally delete the sandbox object
|
||||
// altogether from the data store. If the daemon exits
|
||||
// ungracefully in the middle of a sandbox delete, this way we
|
||||
// will have all the references to the endpoints in the
|
||||
// sandbox so that we can clean them up when we restart
|
||||
sb.inDelete = true
|
||||
sb.Unlock()
|
||||
|
||||
c := sb.controller
|
||||
|
||||
// Detach from all endpoints
|
||||
|
@ -355,6 +372,10 @@ func releaseOSSboxResources(osSbox osl.Sandbox, ep *endpoint) {
|
|||
joinInfo := ep.joinInfo
|
||||
ep.Unlock()
|
||||
|
||||
if joinInfo == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Remove non-interface routes.
|
||||
for _, r := range joinInfo.StaticRoutes {
|
||||
if err := osSbox.RemoveStaticRoute(r); err != nil {
|
||||
|
@ -386,6 +407,7 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
|
|||
sb.Unlock()
|
||||
return nil
|
||||
}
|
||||
inDelete := sb.inDelete
|
||||
sb.Unlock()
|
||||
|
||||
ep.Lock()
|
||||
|
@ -425,7 +447,16 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
|
|||
}
|
||||
}
|
||||
}
|
||||
return sb.storeUpdate()
|
||||
|
||||
// Only update the store if we did not come here as part of
|
||||
// sandbox delete. If we came here as part of delete then do
|
||||
// not bother updating the store. The sandbox object will be
|
||||
// deleted anyway
|
||||
if !inDelete {
|
||||
return sb.storeUpdate()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
|
||||
|
@ -437,6 +468,7 @@ func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
|
|||
|
||||
sb.Lock()
|
||||
osSbox := sb.osSbox
|
||||
inDelete := sb.inDelete
|
||||
sb.Unlock()
|
||||
if osSbox != nil {
|
||||
releaseOSSboxResources(osSbox, ep)
|
||||
|
@ -480,7 +512,15 @@ func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
|
|||
sb.updateGateway(gwepAfter)
|
||||
}
|
||||
|
||||
return sb.storeUpdate()
|
||||
// Only update the store if we did not come here as part of
|
||||
// sandbox delete. If we came here as part of delete then do
|
||||
// not bother updating the store. The sandbox object will be
|
||||
// deleted anyway
|
||||
if !inDelete {
|
||||
return sb.storeUpdate()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
const (
|
||||
|
|
|
@ -123,6 +123,8 @@ func (sb *sandbox) storeUpdate() error {
|
|||
ID: sb.id,
|
||||
}
|
||||
|
||||
retry:
|
||||
sbs.Eps = nil
|
||||
for _, ep := range sb.getConnectedEndpoints() {
|
||||
eps := epState{
|
||||
Nid: ep.getNetwork().ID(),
|
||||
|
@ -132,7 +134,16 @@ func (sb *sandbox) storeUpdate() error {
|
|||
sbs.Eps = append(sbs.Eps, eps)
|
||||
}
|
||||
|
||||
return sb.controller.updateToStore(sbs)
|
||||
err := sb.controller.updateToStore(sbs)
|
||||
if err == datastore.ErrKeyModified {
|
||||
// When we get ErrKeyModified it is sufficient to just
|
||||
// go back and retry. No need to get the object from
|
||||
// the store because we always regenerate the store
|
||||
// state from the in-memory sandbox state
|
||||
goto retry
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func (sb *sandbox) storeDelete() error {
|
||||
|
|
|
@ -64,6 +64,23 @@ function test_single_network_connectivity() {
|
|||
done
|
||||
}
|
||||
|
||||
# Runs the bridge connectivity helper twice, restarting the dnet-1-bridge
# container between the first and the second pass.
@test "Test default network dnet ungraceful restart" {
|
||||
skip_for_circleci
|
||||
|
||||
# Print the output of `docker ps` into the test log.
echo $(docker ps)
|
||||
|
||||
# Two iterations: 1 = run, then restart dnet; 2 = run again after the restart.
for iter in `seq 1 2`;
|
||||
do
|
||||
if [ "$iter" -eq 1 ]; then
|
||||
# First pass passes an extra "skip" argument to the helper.
# NOTE(review): presumably this skips the helper's cleanup so its state
# is still present across the restart -- confirm in
# test_single_network_connectivity.
test_single_network_connectivity bridge 3 skip
|
||||
# Restart the dnet container and wait for it before the second pass.
docker restart dnet-1-bridge
|
||||
wait_for_dnet $(inst_id2port 1) dnet-1-bridge
|
||||
else
|
||||
# Second pass: same helper call without the "skip" argument.
test_single_network_connectivity bridge 3
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
@test "Test bridge network" {
|
||||
skip_for_circleci
|
||||
|
||||
|
|
|
@ -99,7 +99,7 @@ function run_dnet_tests() {
|
|||
./integration-tmp/bin/bats ./test/integration/dnet/dnet.bats
|
||||
}
|
||||
|
||||
function run_simple_tests() {
|
||||
function run_simple_consul_tests() {
|
||||
# Test a single node configuration with a global scope test driver
|
||||
## Setup
|
||||
start_dnet 1 simple 1>>${INTEGRATION_ROOT}/test.log 2>&1
|
||||
|
@ -205,15 +205,15 @@ if [ -z "$SUITES" ]; then
|
|||
then
|
||||
# We can only run a limited list of suites in circleci because of the
|
||||
# old kernel and limited docker environment.
|
||||
suites="dnet simple multi_consul multi_zk multi_etcd"
|
||||
suites="dnet simple_consul multi_consul multi_zk multi_etcd"
|
||||
else
|
||||
suites="dnet simple multi_consul multi_zk multi_etcd bridge overlay_consul overlay_zk overlay_etcd"
|
||||
suites="dnet simple_consul multi_consul multi_zk multi_etcd bridge overlay_consul overlay_zk overlay_etcd"
|
||||
fi
|
||||
else
|
||||
suites="$SUITES"
|
||||
fi
|
||||
|
||||
if [[ "$suites" =~ .*consul.* ]]; then
|
||||
if [[ ( "$suites" =~ .*consul.* ) || ( "$suites" =~ .*bridge.* ) ]]; then
|
||||
echo "Starting consul ..."
|
||||
start_consul 1>>${INTEGRATION_ROOT}/test.log 2>&1
|
||||
cmap[pr_consul]=pr_consul
|
||||
|
|
Loading…
Add table
Reference in a new issue