Browse Source

Fix event handler exiting when containerd is restarted

Description:
Kill docker-containerd repeatedly, and use kill -SIGUSR1 <dockerpid>
to dump the docker call stacks. The dump shows that the event
handler (startEventsMonitor or handleEventStream) has exited.

This only happens when the system is busy: containerd needs more time to
start up, and the monitor goroutine may exit.

Signed-off-by: Wentao Zhang <zhangwentao234@huawei.com>
Wentao Zhang 8 years ago
parent
commit
02ce73f62e
1 changed files with 20 additions and 7 deletions
  1. 20 7
      libcontainerd/remote_unix.go

+ 20 - 7
libcontainerd/remote_unix.go

@@ -49,7 +49,7 @@ type remote struct {
 	stateDir             string
 	stateDir             string
 	rpcAddr              string
 	rpcAddr              string
 	startDaemon          bool
 	startDaemon          bool
-	closeManually        bool
+	closedManually       bool
 	debugLog             bool
 	debugLog             bool
 	rpcConn              *grpc.ClientConn
 	rpcConn              *grpc.ClientConn
 	clients              []*client
 	clients              []*client
@@ -154,7 +154,7 @@ func (r *remote) handleConnectionChange() {
 		logrus.Debugf("libcontainerd: containerd health check returned error: %v", err)
 		logrus.Debugf("libcontainerd: containerd health check returned error: %v", err)
 
 
 		if r.daemonPid != -1 {
 		if r.daemonPid != -1 {
-			if r.closeManually {
+			if r.closedManually {
 				// Well, we asked for it to stop, just return
 				// Well, we asked for it to stop, just return
 				return
 				return
 			}
 			}
@@ -180,7 +180,7 @@ func (r *remote) Cleanup() {
 	if r.daemonPid == -1 {
 	if r.daemonPid == -1 {
 		return
 		return
 	}
 	}
-	r.closeManually = true
+	r.closedManually = true
 	r.rpcConn.Close()
 	r.rpcConn.Close()
 	// Ask the daemon to quit
 	// Ask the daemon to quit
 	syscall.Kill(r.daemonPid, syscall.SIGTERM)
 	syscall.Kill(r.daemonPid, syscall.SIGTERM)
@@ -280,10 +280,23 @@ func (r *remote) startEventsMonitor() error {
 	er := &containerd.EventsRequest{
 	er := &containerd.EventsRequest{
 		Timestamp: tsp,
 		Timestamp: tsp,
 	}
 	}
-	events, err := r.apiClient.Events(context.Background(), er, grpc.FailFast(false))
-	if err != nil {
-		return err
+
+	var events containerd.API_EventsClient
+	for {
+		events, err = r.apiClient.Events(context.Background(), er, grpc.FailFast(false))
+		if err == nil {
+			break
+		}
+		logrus.Warnf("libcontainerd: failed to get events from containerd: %q", err)
+
+		if r.closedManually {
+			// ignore error if grpc remote connection is closed manually
+			return nil
+		}
+
+		<-time.After(100 * time.Millisecond)
 	}
 	}
+
 	go r.handleEventStream(events)
 	go r.handleEventStream(events)
 	return nil
 	return nil
 }
 }
@@ -293,7 +306,7 @@ func (r *remote) handleEventStream(events containerd.API_EventsClient) {
 		e, err := events.Recv()
 		e, err := events.Recv()
 		if err != nil {
 		if err != nil {
 			if grpc.ErrorDesc(err) == transport.ErrConnClosing.Desc &&
 			if grpc.ErrorDesc(err) == transport.ErrConnClosing.Desc &&
-				r.closeManually {
+				r.closedManually {
 				// ignore error if grpc remote connection is closed manually
 				// ignore error if grpc remote connection is closed manually
 				return
 				return
 			}
 			}