Fixes a case of ungraceful daemon restart + unreachable store

For ungraceful daemon restarts, libnetwork has sandbox cleanup logic to
remove any stale & dangling resources. But if the store is down during
the daemon restart, the cleanup logic is not able to perform a
complete cleanup. Previously, the sandbox was removed in such cases
anyway. With this fix, we retain the sandbox if the store is down and
the endpoint couldn't be cleaned up. When the container is later
restarted in the docker daemon, we perform a sandbox cleanup again, and
that completes the cleanup round.

Signed-off-by: Madhu Venugopal <madhu@docker.com>
This commit is contained in:
Madhu Venugopal 2015-10-29 17:16:52 -07:00
parent 99132ffb7f
commit c8a66f5e72
3 changed files with 28 additions and 9 deletions

View file

@ -118,6 +118,12 @@ func (d *driver) Leave(nid, eid string) error {
return fmt.Errorf("could not find network with id %s", nid)
}
ep := n.endpoint(eid)
if ep == nil {
return types.InternalMaskableErrorf("could not find endpoint with id %s", eid)
}
if d.notifyCh != nil {
d.notifyCh <- ovNotify{
action: "leave",

View file

@ -168,6 +168,7 @@ func (sb *sandbox) Delete() error {
c := sb.controller
// Detach from all endpoints
retain := false
for _, ep := range sb.getConnectedEndpoints() {
// endpoint in the Gateway network will be cleaned up
// when when sandbox no longer needs external connectivity
@ -176,14 +177,22 @@ func (sb *sandbox) Delete() error {
}
if err := ep.Leave(sb); err != nil {
retain = true
log.Warnf("Failed detaching sandbox %s from endpoint %s: %v\n", sb.ID(), ep.ID(), err)
}
if err := ep.Delete(); err != nil {
retain = true
log.Warnf("Failed deleting endpoint %s: %v\n", ep.ID(), err)
}
}
if retain {
sb.Lock()
sb.inDelete = false
sb.Unlock()
return fmt.Errorf("could not cleanup all the endpoints in container %s / sandbox %s", sb.containerID, sb.id)
}
// Container is going away. Path cache in etchosts is most
// likely not required any more. Drop it.
etchosts.Drop(sb.config.hostsPath)

View file

@ -3,6 +3,7 @@ package libnetwork
import (
"container/heap"
"encoding/json"
"sync"
"github.com/Sirupsen/logrus"
"github.com/docker/libnetwork/datastore"
@ -119,8 +120,9 @@ func (sbs *sbState) DataScope() string {
func (sb *sandbox) storeUpdate() error {
sbs := &sbState{
c: sb.controller,
ID: sb.id,
c: sb.controller,
ID: sb.id,
Cid: sb.containerID,
}
retry:
@ -197,15 +199,17 @@ func (c *controller) sandboxCleanup() {
for _, eps := range sbs.Eps {
n, err := c.getNetworkFromStore(eps.Nid)
var ep *endpoint
if err != nil {
logrus.Errorf("getNetworkFromStore for nid %s failed while trying to build sandbox for cleanup: %v", eps.Nid, err)
continue
}
ep, err := n.getEndpointFromStore(eps.Eid)
if err != nil {
logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
continue
n = &network{id: eps.Nid, ctrlr: c, drvOnce: &sync.Once{}}
ep = &endpoint{id: eps.Eid, network: n}
} else {
ep, err = n.getEndpointFromStore(eps.Eid)
if err != nil {
logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
ep = &endpoint{id: eps.Eid, network: n}
}
}
heap.Push(&sb.endpoints, ep)