Quellcode durchsuchen

Fixes a case of ungraceful daemon restart + unreachable store

For ungraceful daemon restarts, libnetwork has sandbox cleanup logic to
remove any stale and dangling resources. However, if the store is down
during the daemon restart, the cleanup logic cannot perform a complete
cleanup. Previously, the sandbox was removed anyway in such cases. With
this fix, we retain the sandbox if the store is down and the endpoint
couldn't be cleaned up. When the container is later restarted by the
docker daemon, we perform a sandbox cleanup again, and that completes
the cleanup round.

Signed-off-by: Madhu Venugopal <madhu@docker.com>
Madhu Venugopal vor 9 Jahren
Ursprung
Commit
c8a66f5e72
3 geänderte Dateien mit 28 neuen und 9 gelöschten Zeilen
  1. 6 0
      libnetwork/drivers/overlay/joinleave.go
  2. 9 0
      libnetwork/sandbox.go
  3. 13 9
      libnetwork/sandbox_store.go

+ 6 - 0
libnetwork/drivers/overlay/joinleave.go

@@ -118,6 +118,12 @@ func (d *driver) Leave(nid, eid string) error {
 		return fmt.Errorf("could not find network with id %s", nid)
 	}
 
+	ep := n.endpoint(eid)
+
+	if ep == nil {
+		return types.InternalMaskableErrorf("could not find endpoint with id %s", eid)
+	}
+
 	if d.notifyCh != nil {
 		d.notifyCh <- ovNotify{
 			action: "leave",

+ 9 - 0
libnetwork/sandbox.go

@@ -168,6 +168,7 @@ func (sb *sandbox) Delete() error {
 	c := sb.controller
 
 	// Detach from all endpoints
+	retain := false
 	for _, ep := range sb.getConnectedEndpoints() {
 		// endpoint in the Gateway network will be cleaned up
 		// when when sandbox no longer needs external connectivity
@@ -176,14 +177,22 @@ func (sb *sandbox) Delete() error {
 		}
 
 		if err := ep.Leave(sb); err != nil {
+			retain = true
 			log.Warnf("Failed detaching sandbox %s from endpoint %s: %v\n", sb.ID(), ep.ID(), err)
 		}
 
 		if err := ep.Delete(); err != nil {
+			retain = true
 			log.Warnf("Failed deleting endpoint %s: %v\n", ep.ID(), err)
 		}
 	}
 
+	if retain {
+		sb.Lock()
+		sb.inDelete = false
+		sb.Unlock()
+		return fmt.Errorf("could not cleanup all the endpoints in container %s / sandbox %s", sb.containerID, sb.id)
+	}
 	// Container is going away. Path cache in etchosts is most
 	// likely not required any more. Drop it.
 	etchosts.Drop(sb.config.hostsPath)

+ 13 - 9
libnetwork/sandbox_store.go

@@ -3,6 +3,7 @@ package libnetwork
 import (
 	"container/heap"
 	"encoding/json"
+	"sync"
 
 	"github.com/Sirupsen/logrus"
 	"github.com/docker/libnetwork/datastore"
@@ -119,8 +120,9 @@ func (sbs *sbState) DataScope() string {
 
 func (sb *sandbox) storeUpdate() error {
 	sbs := &sbState{
-		c:  sb.controller,
-		ID: sb.id,
+		c:   sb.controller,
+		ID:  sb.id,
+		Cid: sb.containerID,
 	}
 
 retry:
@@ -197,15 +199,17 @@ func (c *controller) sandboxCleanup() {
 
 		for _, eps := range sbs.Eps {
 			n, err := c.getNetworkFromStore(eps.Nid)
+			var ep *endpoint
 			if err != nil {
 				logrus.Errorf("getNetworkFromStore for nid %s failed while trying to build sandbox for cleanup: %v", eps.Nid, err)
-				continue
-			}
-
-			ep, err := n.getEndpointFromStore(eps.Eid)
-			if err != nil {
-				logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
-				continue
+				n = &network{id: eps.Nid, ctrlr: c, drvOnce: &sync.Once{}}
+				ep = &endpoint{id: eps.Eid, network: n}
+			} else {
+				ep, err = n.getEndpointFromStore(eps.Eid)
+				if err != nil {
+					logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
+					ep = &endpoint{id: eps.Eid, network: n}
+				}
 			}
 
 			heap.Push(&sb.endpoints, ep)