From c8a66f5e72d085d0f3550f70adde82ad3e9121b0 Mon Sep 17 00:00:00 2001 From: Madhu Venugopal Date: Thu, 29 Oct 2015 17:16:52 -0700 Subject: [PATCH] Fixes a case of ungraceful daemon restart + unreachable store For ungraceful daemon restarts, libnetwork has sandbox cleanup logic to remove any stale & dangling resources. But, if the store is down during the daemon restart, then the cleanup logic would not be able to perform a complete cleanup. In such cases, the sandbox was removed anyway. With this fix, we retain the sandbox if the store is down and the endpoint couldn't be cleaned. When the container is later restarted in docker daemon, we will perform a sandbox cleanup and that will complete the cleanup round. Signed-off-by: Madhu Venugopal --- libnetwork/drivers/overlay/joinleave.go | 6 ++++++ libnetwork/sandbox.go | 9 +++++++++ libnetwork/sandbox_store.go | 22 +++++++++++++--------- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/libnetwork/drivers/overlay/joinleave.go b/libnetwork/drivers/overlay/joinleave.go index 4e844d4c45..997a3e77b7 100644 --- a/libnetwork/drivers/overlay/joinleave.go +++ b/libnetwork/drivers/overlay/joinleave.go @@ -118,6 +118,12 @@ func (d *driver) Leave(nid, eid string) error { return fmt.Errorf("could not find network with id %s", nid) } + ep := n.endpoint(eid) + + if ep == nil { + return types.InternalMaskableErrorf("could not find endpoint with id %s", eid) + } + if d.notifyCh != nil { d.notifyCh <- ovNotify{ action: "leave", diff --git a/libnetwork/sandbox.go b/libnetwork/sandbox.go index bb5097935c..5ea3f1f80f 100644 --- a/libnetwork/sandbox.go +++ b/libnetwork/sandbox.go @@ -168,6 +168,7 @@ func (sb *sandbox) Delete() error { c := sb.controller // Detach from all endpoints + retain := false for _, ep := range sb.getConnectedEndpoints() { // endpoint in the Gateway network will be cleaned up // when when sandbox no longer needs external connectivity @@ -176,14 +177,22 @@ func (sb *sandbox) Delete() error { } if err := 
ep.Leave(sb); err != nil { + retain = true log.Warnf("Failed detaching sandbox %s from endpoint %s: %v\n", sb.ID(), ep.ID(), err) } if err := ep.Delete(); err != nil { + retain = true log.Warnf("Failed deleting endpoint %s: %v\n", ep.ID(), err) } } + if retain { + sb.Lock() + sb.inDelete = false + sb.Unlock() + return fmt.Errorf("could not cleanup all the endpoints in container %s / sandbox %s", sb.containerID, sb.id) + } // Container is going away. Path cache in etchosts is most // likely not required any more. Drop it. etchosts.Drop(sb.config.hostsPath) diff --git a/libnetwork/sandbox_store.go b/libnetwork/sandbox_store.go index 0b8f0c95e3..4844032b2b 100644 --- a/libnetwork/sandbox_store.go +++ b/libnetwork/sandbox_store.go @@ -3,6 +3,7 @@ package libnetwork import ( "container/heap" "encoding/json" + "sync" "github.com/Sirupsen/logrus" "github.com/docker/libnetwork/datastore" @@ -119,8 +120,9 @@ func (sbs *sbState) DataScope() string { func (sb *sandbox) storeUpdate() error { sbs := &sbState{ - c: sb.controller, - ID: sb.id, + c: sb.controller, + ID: sb.id, + Cid: sb.containerID, } retry: @@ -197,15 +199,17 @@ func (c *controller) sandboxCleanup() { for _, eps := range sbs.Eps { n, err := c.getNetworkFromStore(eps.Nid) + var ep *endpoint if err != nil { logrus.Errorf("getNetworkFromStore for nid %s failed while trying to build sandbox for cleanup: %v", eps.Nid, err) - continue - } - - ep, err := n.getEndpointFromStore(eps.Eid) - if err != nil { - logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err) - continue + n = &network{id: eps.Nid, ctrlr: c, drvOnce: &sync.Once{}} + ep = &endpoint{id: eps.Eid, network: n} + } else { + ep, err = n.getEndpointFromStore(eps.Eid) + if err != nil { + logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err) + ep = &endpoint{id: eps.Eid, network: n} + } } heap.Push(&sb.endpoints, ep)