浏览代码

Fixes a case of ungraceful daemon restart + unreachable store

For ungraceful daemon restarts, libnetwork has sandbox cleanup logic to
remove any stale and dangling resources. However, if the store is down
during the daemon restart, the cleanup logic cannot perform a complete
cleanup. In such cases, the sandbox was removed anyway. With this fix,
we retain the sandbox if the store is down and an endpoint couldn't be
cleaned up. When the container is later restarted in the docker daemon,
we perform a sandbox cleanup again, which completes the cleanup round.

Signed-off-by: Madhu Venugopal <madhu@docker.com>
Madhu Venugopal 9 年之前
父节点
当前提交
c8a66f5e72
共有 3 个文件被更改,包括 28 次插入和 9 次删除
  1. 6 0
      libnetwork/drivers/overlay/joinleave.go
  2. 9 0
      libnetwork/sandbox.go
  3. 13 9
      libnetwork/sandbox_store.go

+ 6 - 0
libnetwork/drivers/overlay/joinleave.go

@@ -118,6 +118,12 @@ func (d *driver) Leave(nid, eid string) error {
 		return fmt.Errorf("could not find network with id %s", nid)
 	}
 
+	ep := n.endpoint(eid)
+
+	if ep == nil {
+		return types.InternalMaskableErrorf("could not find endpoint with id %s", eid)
+	}
+
 	if d.notifyCh != nil {
 		d.notifyCh <- ovNotify{
 			action: "leave",

+ 9 - 0
libnetwork/sandbox.go

@@ -168,6 +168,7 @@ func (sb *sandbox) Delete() error {
 	c := sb.controller
 
 	// Detach from all endpoints
+	retain := false
 	for _, ep := range sb.getConnectedEndpoints() {
 		// endpoint in the Gateway network will be cleaned up
 		// when sandbox no longer needs external connectivity
@@ -176,14 +177,22 @@ func (sb *sandbox) Delete() error {
 		}
 
 		if err := ep.Leave(sb); err != nil {
+			retain = true
 			log.Warnf("Failed detaching sandbox %s from endpoint %s: %v\n", sb.ID(), ep.ID(), err)
 		}
 
 		if err := ep.Delete(); err != nil {
+			retain = true
 			log.Warnf("Failed deleting endpoint %s: %v\n", ep.ID(), err)
 		}
 	}
 
+	if retain {
+		sb.Lock()
+		sb.inDelete = false
+		sb.Unlock()
+		return fmt.Errorf("could not cleanup all the endpoints in container %s / sandbox %s", sb.containerID, sb.id)
+	}
 	// Container is going away. Path cache in etchosts is most
 	// likely not required any more. Drop it.
 	etchosts.Drop(sb.config.hostsPath)

+ 13 - 9
libnetwork/sandbox_store.go

@@ -3,6 +3,7 @@ package libnetwork
 import (
 	"container/heap"
 	"encoding/json"
+	"sync"
 
 	"github.com/Sirupsen/logrus"
 	"github.com/docker/libnetwork/datastore"
@@ -119,8 +120,9 @@ func (sbs *sbState) DataScope() string {
 
 func (sb *sandbox) storeUpdate() error {
 	sbs := &sbState{
-		c:  sb.controller,
-		ID: sb.id,
+		c:   sb.controller,
+		ID:  sb.id,
+		Cid: sb.containerID,
 	}
 
 retry:
@@ -197,15 +199,17 @@ func (c *controller) sandboxCleanup() {
 
 		for _, eps := range sbs.Eps {
 			n, err := c.getNetworkFromStore(eps.Nid)
+			var ep *endpoint
 			if err != nil {
 				logrus.Errorf("getNetworkFromStore for nid %s failed while trying to build sandbox for cleanup: %v", eps.Nid, err)
-				continue
-			}
-
-			ep, err := n.getEndpointFromStore(eps.Eid)
-			if err != nil {
-				logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
-				continue
+				n = &network{id: eps.Nid, ctrlr: c, drvOnce: &sync.Once{}}
+				ep = &endpoint{id: eps.Eid, network: n}
+			} else {
+				ep, err = n.getEndpointFromStore(eps.Eid)
+				if err != nil {
+					logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
+					ep = &endpoint{id: eps.Eid, network: n}
+				}
 			}
 
 			heap.Push(&sb.endpoints, ep)