From 9ed0504592d338890a37e18999f98d69d7103f2d Mon Sep 17 00:00:00 2001 From: Wei Fu Date: Thu, 24 Oct 2019 18:09:56 +0800 Subject: [PATCH] daemon: add grpc.WithBlock option WithBlock makes sure that the following containerd requests are reliable. In one edge case under high load pressure, the kernel kills dockerd, containerd and containerd-shims because of OOM. When both dockerd and containerd restart, containerd will take time to recover all the existing containers. Before containerd is serving, dockerd will fail with a gRPC error. The bad thing is that the restore action will still ignore any non-NotFound errors and return a running state for already stopped containers. This is unexpected behavior, and we need to restart dockerd to make sure that everything is OK. It is painful. Adding WithBlock can prevent the edge case. And in the common case, containerd will be serving shortly. There is no harm in adding WithBlock for the containerd connection. Signed-off-by: Wei Fu (cherry picked from commit 9f73396dabf087a8dd5fa74296c2cd4c188ff889) Signed-off-by: Wei Fu --- daemon/daemon.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/daemon/daemon.go b/daemon/daemon.go index 014ec970a9..acc619a6af 100644 --- a/daemon/daemon.go +++ b/daemon/daemon.go @@ -866,6 +866,24 @@ func NewDaemon(ctx context.Context, config *config.Config, pluginStore *plugin.S registerMetricsPluginCallback(d.PluginStore, metricsSockPath) gopts := []grpc.DialOption{ + // WithBlock makes sure that the following containerd requests + // are reliable. + // + // NOTE: In one edge case under high load pressure, the kernel kills + // dockerd, containerd and containerd-shims because of OOM. + // When both dockerd and containerd restart, containerd + // will take time to recover all the existing containers. Before + // containerd is serving, dockerd will fail with a gRPC error.
+ // The bad thing is that the restore action will still ignore + // any non-NotFound errors and return a running state for + // already stopped containers. This is unexpected behavior, and + // we need to restart dockerd to make sure that everything is OK. + // + // It is painful. Adding WithBlock can prevent the edge case. And + // in the common case, containerd will be serving shortly. + // There is no harm in adding WithBlock for the containerd connection. + grpc.WithBlock(), + grpc.WithInsecure(), grpc.WithBackoffMaxDelay(3 * time.Second), grpc.WithDialer(dialer.Dialer),