Fixes a case of ungraceful daemon restart + unreachable store
For ungraceful daemon restarts, libnetwork has sandbox cleanup logic to remove any stale and dangling resources. But if the store is down during the daemon restart, the cleanup logic is not able to perform a complete cleanup. Previously, the sandbox was removed anyway in such cases. With this fix, we retain the sandbox if the store is down and the endpoint couldn't be cleaned up. When the container is later restarted in the docker daemon, we will perform a sandbox cleanup, and that will complete the cleanup round. Signed-off-by: Madhu Venugopal <madhu@docker.com>
This commit is contained in:
parent
99132ffb7f
commit
c8a66f5e72
3 changed files with 28 additions and 9 deletions
|
@ -118,6 +118,12 @@ func (d *driver) Leave(nid, eid string) error {
|
|||
return fmt.Errorf("could not find network with id %s", nid)
|
||||
}
|
||||
|
||||
ep := n.endpoint(eid)
|
||||
|
||||
if ep == nil {
|
||||
return types.InternalMaskableErrorf("could not find endpoint with id %s", eid)
|
||||
}
|
||||
|
||||
if d.notifyCh != nil {
|
||||
d.notifyCh <- ovNotify{
|
||||
action: "leave",
|
||||
|
|
|
@ -168,6 +168,7 @@ func (sb *sandbox) Delete() error {
|
|||
c := sb.controller
|
||||
|
||||
// Detach from all endpoints
|
||||
retain := false
|
||||
for _, ep := range sb.getConnectedEndpoints() {
|
||||
// endpoint in the Gateway network will be cleaned up
|
||||
// when the sandbox no longer needs external connectivity
|
||||
|
@ -176,14 +177,22 @@ func (sb *sandbox) Delete() error {
|
|||
}
|
||||
|
||||
if err := ep.Leave(sb); err != nil {
|
||||
retain = true
|
||||
log.Warnf("Failed detaching sandbox %s from endpoint %s: %v\n", sb.ID(), ep.ID(), err)
|
||||
}
|
||||
|
||||
if err := ep.Delete(); err != nil {
|
||||
retain = true
|
||||
log.Warnf("Failed deleting endpoint %s: %v\n", ep.ID(), err)
|
||||
}
|
||||
}
|
||||
|
||||
if retain {
|
||||
sb.Lock()
|
||||
sb.inDelete = false
|
||||
sb.Unlock()
|
||||
return fmt.Errorf("could not cleanup all the endpoints in container %s / sandbox %s", sb.containerID, sb.id)
|
||||
}
|
||||
// Container is going away. Path cache in etchosts is most
|
||||
// likely not required any more. Drop it.
|
||||
etchosts.Drop(sb.config.hostsPath)
|
||||
|
|
|
@ -3,6 +3,7 @@ package libnetwork
|
|||
import (
|
||||
"container/heap"
|
||||
"encoding/json"
|
||||
"sync"
|
||||
|
||||
"github.com/Sirupsen/logrus"
|
||||
"github.com/docker/libnetwork/datastore"
|
||||
|
@ -119,8 +120,9 @@ func (sbs *sbState) DataScope() string {
|
|||
|
||||
func (sb *sandbox) storeUpdate() error {
|
||||
sbs := &sbState{
|
||||
c: sb.controller,
|
||||
ID: sb.id,
|
||||
c: sb.controller,
|
||||
ID: sb.id,
|
||||
Cid: sb.containerID,
|
||||
}
|
||||
|
||||
retry:
|
||||
|
@ -197,15 +199,17 @@ func (c *controller) sandboxCleanup() {
|
|||
|
||||
for _, eps := range sbs.Eps {
|
||||
n, err := c.getNetworkFromStore(eps.Nid)
|
||||
var ep *endpoint
|
||||
if err != nil {
|
||||
logrus.Errorf("getNetworkFromStore for nid %s failed while trying to build sandbox for cleanup: %v", eps.Nid, err)
|
||||
continue
|
||||
}
|
||||
|
||||
ep, err := n.getEndpointFromStore(eps.Eid)
|
||||
if err != nil {
|
||||
logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
|
||||
continue
|
||||
n = &network{id: eps.Nid, ctrlr: c, drvOnce: &sync.Once{}}
|
||||
ep = &endpoint{id: eps.Eid, network: n}
|
||||
} else {
|
||||
ep, err = n.getEndpointFromStore(eps.Eid)
|
||||
if err != nil {
|
||||
logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
|
||||
ep = &endpoint{id: eps.Eid, network: n}
|
||||
}
|
||||
}
|
||||
|
||||
heap.Push(&sb.endpoints, ep)
|
||||
|
|
Loading…
Add table
Reference in a new issue