Make sandbox cleanup robust for ungraceful exits
When the daemon has a lot of containers, the 15 seconds it allows for stopping all of them is often not enough, so the daemon forces a shutdown when that window expires. Hence, even bringing the daemon down gracefully can leave many containers not fully brought down. In addition, because the daemon can force-kill itself at any arbitrary point in time, the checkpointed sandbox state on disk can be left inconsistent. This makes cleanup fail when the daemon comes back up, and in many cases this inability to clean up properly on restart prevents the daemon from restarting at all, because we are not able to delete the default network. This commit ensures that the sandbox state stored on disk is never inconsistent, so that when we come back up we will always be able to clean up the sandbox state. Signed-off-by: Jana Radhakrishnan <mrjana@docker.com>
This commit is contained in:
parent
2509014be8
commit
96d819cb06
2 changed files with 54 additions and 3 deletions
|
@ -66,6 +66,7 @@ type sandbox struct {
|
|||
joinLeaveDone chan struct{}
|
||||
dbIndex uint64
|
||||
dbExists bool
|
||||
inDelete bool
|
||||
sync.Mutex
|
||||
}
|
||||
|
||||
|
@ -146,6 +147,22 @@ func (sb *sandbox) Statistics() (map[string]*types.InterfaceStatistics, error) {
|
|||
}
|
||||
|
||||
func (sb *sandbox) Delete() error {
|
||||
sb.Lock()
|
||||
if sb.inDelete {
|
||||
sb.Unlock()
|
||||
return types.ForbiddenErrorf("another sandbox delete in progress")
|
||||
}
|
||||
// Set the inDelete flag. This will ensure that we don't
|
||||
// update the store until we have completed all the endpoint
|
||||
// leaves and deletes. And when endpoint leaves and deletes
|
||||
// are completed then we can finally delete the sandbox object
|
||||
// altogether from the data store. If the daemon exits
|
||||
// ungracefully in the middle of a sandbox delete this way we
|
||||
// will have all the references to the endpoints in the
|
||||
// sandbox so that we can clean them up when we restart
|
||||
sb.inDelete = true
|
||||
sb.Unlock()
|
||||
|
||||
c := sb.controller
|
||||
|
||||
// Detach from all endpoints
|
||||
|
@ -355,6 +372,10 @@ func releaseOSSboxResources(osSbox osl.Sandbox, ep *endpoint) {
|
|||
joinInfo := ep.joinInfo
|
||||
ep.Unlock()
|
||||
|
||||
if joinInfo == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Remove non-interface routes.
|
||||
for _, r := range joinInfo.StaticRoutes {
|
||||
if err := osSbox.RemoveStaticRoute(r); err != nil {
|
||||
|
@ -386,6 +407,7 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
|
|||
sb.Unlock()
|
||||
return nil
|
||||
}
|
||||
inDelete := sb.inDelete
|
||||
sb.Unlock()
|
||||
|
||||
ep.Lock()
|
||||
|
@ -425,7 +447,16 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
|
|||
}
|
||||
}
|
||||
}
|
||||
return sb.storeUpdate()
|
||||
|
||||
// Only update the store if we did not come here as part of
|
||||
// sandbox delete. If we came here as part of delete then do
|
||||
// not bother updating the store. The sandbox object will be
|
||||
// deleted anyway
|
||||
if !inDelete {
|
||||
return sb.storeUpdate()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
|
||||
|
@ -437,6 +468,7 @@ func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
|
|||
|
||||
sb.Lock()
|
||||
osSbox := sb.osSbox
|
||||
inDelete := sb.inDelete
|
||||
sb.Unlock()
|
||||
if osSbox != nil {
|
||||
releaseOSSboxResources(osSbox, ep)
|
||||
|
@ -480,7 +512,15 @@ func (sb *sandbox) clearNetworkResources(origEp *endpoint) error {
|
|||
sb.updateGateway(gwepAfter)
|
||||
}
|
||||
|
||||
return sb.storeUpdate()
|
||||
// Only update the store if we did not come here as part of
|
||||
// sandbox delete. If we came here as part of delete then do
|
||||
// not bother updating the store. The sandbox object will be
|
||||
// deleted anyway
|
||||
if !inDelete {
|
||||
return sb.storeUpdate()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
const (
|
||||
|
|
|
@ -123,6 +123,8 @@ func (sb *sandbox) storeUpdate() error {
|
|||
ID: sb.id,
|
||||
}
|
||||
|
||||
retry:
|
||||
sbs.Eps = nil
|
||||
for _, ep := range sb.getConnectedEndpoints() {
|
||||
eps := epState{
|
||||
Nid: ep.getNetwork().ID(),
|
||||
|
@ -132,7 +134,16 @@ func (sb *sandbox) storeUpdate() error {
|
|||
sbs.Eps = append(sbs.Eps, eps)
|
||||
}
|
||||
|
||||
return sb.controller.updateToStore(sbs)
|
||||
err := sb.controller.updateToStore(sbs)
|
||||
if err == datastore.ErrKeyModified {
|
||||
// When we get ErrKeyModified it is sufficient to just
|
||||
// go back and retry. No need to get the object from
|
||||
// the store because we always regenerate the store
|
||||
// state from in memory sandbox state
|
||||
goto retry
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
func (sb *sandbox) storeDelete() error {
|
||||
|
|
Loading…
Reference in a new issue