Make sandbox cleanup robust for ungraceful exits

When the daemon has a lot of containers, the 15 seconds it allows
for stopping all of them is not enough, so the daemon forces a
shutdown at the end of those 15 seconds. Hence, in a situation with
a lot of containers, even gracefully bringing down the daemon will
result in many containers not being fully brought down.

In addition to this, the daemon force-killing itself can happen at
any arbitrary point in time, which will result in an inconsistent
checkpointed state for the sandbox. This makes the cleanup fail
when we come back up, and in many cases this inability to clean up
properly on restart leaves the daemon unable to restart, because we
are not able to delete the default network.

This commit ensures that the sandbox state stored on disk is never
inconsistent, so that when we come back up we will always be able
to clean up the sandbox state.

Signed-off-by: Jana Radhakrishnan <mrjana@docker.com>
This commit is contained in:
Jana Radhakrishnan 2015-10-16 18:00:30 -07:00
parent 2509014be8
commit 96d819cb06
2 changed files with 54 additions and 3 deletions

View file

@ -66,6 +66,7 @@ type sandbox struct {
joinLeaveDone chan struct{}
dbIndex uint64
dbExists bool
inDelete bool
sync.Mutex
}
@ -146,6 +147,22 @@ func (sb *sandbox) Statistics() (map[string]*types.InterfaceStatistics, error) {
}
func (sb *sandbox) Delete() error {
sb.Lock()
if sb.inDelete {
sb.Unlock()
return types.ForbiddenErrorf("another sandbox delete in progress")
}
// Set the inDelete flag. This will ensure that we don't
// update the store until we have completed all the endpoint
// leaves and deletes. And when endpoint leaves and deletes
// are completed then we can finally delete the sandbox object
// altogether from the data store. If the daemon exits
// ungracefully in the middle of a sandbox delete this way we
// will have all the references to the endpoints in the
// sandbox so that we can clean them up when we restart
sb.inDelete = true
sb.Unlock()
c := sb.controller
// Detach from all endpoints
@ -355,6 +372,10 @@ func releaseOSSboxResources(osSbox osl.Sandbox, ep *endpoint) {
joinInfo := ep.joinInfo
ep.Unlock()
if joinInfo == nil {
return
}
// Remove non-interface routes.
for _, r := range joinInfo.StaticRoutes {
if err := osSbox.RemoveStaticRoute(r); err != nil {
@ -386,6 +407,7 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
sb.Unlock()
return nil
}
inDelete := sb.inDelete
sb.Unlock()
ep.Lock()
@ -425,7 +447,16 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
}
}
}
return sb.storeUpdate()
// Only update the store if we did not come here as part of
// sandbox delete. If we came here as part of delete then do
// not bother updating the store. The sandbox object will be
// deleted anyway
if !inDelete {
return sb.storeUpdate()
}
return nil
}
func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
@ -437,6 +468,7 @@ func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
sb.Lock()
osSbox := sb.osSbox
inDelete := sb.inDelete
sb.Unlock()
if osSbox != nil {
releaseOSSboxResources(osSbox, ep)
@ -480,7 +512,15 @@ func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
sb.updateGateway(gwepAfter)
}
return sb.storeUpdate()
// Only update the store if we did not come here as part of
// sandbox delete. If we came here as part of delete then do
// not bother updating the store. The sandbox object will be
// deleted anyway
if !inDelete {
return sb.storeUpdate()
}
return nil
}
const (

View file

@ -123,6 +123,8 @@ func (sb *sandbox) storeUpdate() error {
ID: sb.id,
}
retry:
sbs.Eps = nil
for _, ep := range sb.getConnectedEndpoints() {
eps := epState{
Nid: ep.getNetwork().ID(),
@ -132,7 +134,16 @@ func (sb *sandbox) storeUpdate() error {
sbs.Eps = append(sbs.Eps, eps)
}
return sb.controller.updateToStore(sbs)
err := sb.controller.updateToStore(sbs)
if err == datastore.ErrKeyModified {
// When we get ErrKeyModified it is sufficient to just
// go back and retry. No need to get the object from
// the store because we always regenerate the store
// state from in memory sandbox state
goto retry
}
return err
}
func (sb *sandbox) storeDelete() error {