moby/libcontainerd/supervisor/remote_daemon.go

package supervisor // import "github.com/docker/docker/libcontainerd/supervisor"

import (
	"context"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"github.com/containerd/containerd"
	"github.com/containerd/containerd/services/server/config"
	"github.com/docker/docker/pkg/system"
	"github.com/pelletier/go-toml"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
)

const (
	maxConnectionRetryCount = 3
	healthCheckTimeout      = 3 * time.Second
	shutdownTimeout         = 15 * time.Second
	startupTimeout          = 15 * time.Second
	configFile              = "containerd.toml"
	binaryName              = "containerd"
	pidFile                 = "containerd.pid"
)

type remote struct {
	config.Config
	daemonPid     int
	logger        *logrus.Entry
	daemonWaitCh  chan struct{}
	daemonStartCh chan error
	daemonStopCh  chan struct{}
	stateDir      string
}

// Daemon represents a running containerd daemon
type Daemon interface {
	WaitTimeout(time.Duration) error
	Address() string
}

// DaemonOpt allows configuring parameters of container daemons
type DaemonOpt func(c *remote) error

// Start starts a containerd daemon and monitors it
func Start(ctx context.Context, rootDir, stateDir string, opts ...DaemonOpt) (Daemon, error) {
	r := &remote{
		stateDir: stateDir,
		Config: config.Config{
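			// generate a version 2 config; config version 1 is deprecated in containerd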
			Version: 2,
			Root:    filepath.Join(rootDir, "daemon"),
			State:   filepath.Join(stateDir, "daemon"),
		},
		daemonPid:     -1,
		logger:        logrus.WithField("module", "libcontainerd"),
		daemonStartCh: make(chan error, 1),
		daemonStopCh:  make(chan struct{}),
	}

	for _, opt := range opts {
		if err := opt(r); err != nil {
			return nil, err
		}
	}
	r.setDefaults()

	if err := system.MkdirAll(stateDir, 0700); err != nil {
		return nil, err
	}

	go r.monitorDaemon(ctx)

	timeout := time.NewTimer(startupTimeout)
	defer timeout.Stop()

	select {
	case <-timeout.C:
		return nil, errors.New("timeout waiting for containerd to start")
	case err := <-r.daemonStartCh:
		if err != nil {
			return nil, err
		}
	}

	return r, nil
}
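
// WaitTimeout blocks until the managed containerd daemon has stopped, or
// returns an error if it has not stopped within the given duration.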
func (r *remote) WaitTimeout(d time.Duration) error {
	timeout := time.NewTimer(d)
	defer timeout.Stop()

	select {
	case <-timeout.C:
		return errors.New("timeout waiting for containerd to stop")
	case <-r.daemonStopCh:
	}

	return nil
}
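
// Address returns the GRPC address on which the managed containerd listens.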
func (r *remote) Address() string {
	return r.GRPC.Address
}
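
// getContainerdPid reads the pid file in the state dir and returns the
// recorded pid if that process is still alive, or -1 if no running
// containerd process was found.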
func (r *remote) getContainerdPid() (int, error) {
	pidFile := filepath.Join(r.stateDir, pidFile)
	f, err := os.OpenFile(pidFile, os.O_RDWR, 0600)
	if err != nil {
		if os.IsNotExist(err) {
			return -1, nil
		}
		return -1, err
	}
	defer f.Close()

	b := make([]byte, 8)
	n, err := f.Read(b)
	if err != nil && err != io.EOF {
		return -1, err
	}
	if n > 0 {
		pid, err := strconv.ParseUint(string(b[:n]), 10, 64)
		if err != nil {
			return -1, err
		}
		if system.IsProcessAlive(int(pid)) {
			return int(pid), nil
		}
	}
	return -1, nil
}
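
// getContainerdConfig writes the generated containerd configuration to
// containerd.toml in the state dir and returns the path of the written file.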
func (r *remote) getContainerdConfig() (string, error) {
	path := filepath.Join(r.stateDir, configFile)
	f, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600)
	if err != nil {
		return "", errors.Wrapf(err, "failed to open containerd config file at %s", path)
	}
	defer f.Close()

	if err := toml.NewEncoder(f).Encode(r); err != nil {
		return "", errors.Wrapf(err, "failed to write containerd config file (%s)", path)
	}
	return path, nil
}
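
// startContainerd starts a new containerd process, unless the pid file points
// at a containerd that is still running, in which case that process is reused.
// The pid of the (new) process is written to the pid file in the state dir.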
func (r *remote) startContainerd() error {
	pid, err := r.getContainerdPid()
	if err != nil {
		return err
	}

	if pid != -1 {
		r.daemonPid = pid
		r.logger.WithField("pid", pid).Infof("%s is still running", binaryName)
		return nil
	}

	configFile, err := r.getContainerdConfig()
	if err != nil {
		return err
	}

	args := []string{"--config", configFile}
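
	// forward the configured debug log-level (if any) to containerd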
	if r.Debug.Level != "" {
		args = append(args, "--log-level", r.Debug.Level)
	}

	cmd := exec.Command(binaryName, args...)
	// redirect containerd logs to docker logs
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	cmd.SysProcAttr = containerdSysProcAttr()

	// clear the NOTIFY_SOCKET from the env when starting containerd
	cmd.Env = nil
	for _, e := range os.Environ() {
		if !strings.HasPrefix(e, "NOTIFY_SOCKET") {
			cmd.Env = append(cmd.Env, e)
		}
	}

	if err := cmd.Start(); err != nil {
		return err
	}

	r.daemonWaitCh = make(chan struct{})
	go func() {
		// Reap our child when needed
		if err := cmd.Wait(); err != nil {
			r.logger.WithError(err).Errorf("containerd did not exit successfully")
		}
		close(r.daemonWaitCh)
	}()

	r.daemonPid = cmd.Process.Pid
	err = os.WriteFile(filepath.Join(r.stateDir, pidFile), []byte(fmt.Sprintf("%d", r.daemonPid)), 0660)
	if err != nil {
		system.KillProcess(r.daemonPid)
		return errors.Wrap(err, "libcontainerd: failed to save daemon pid to disk")
	}

	r.logger.WithField("pid", r.daemonPid).Infof("started new %s process", binaryName)

	return nil
}
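
// monitorDaemon starts containerd if it is not already running, then
// periodically health-checks it over its GRPC endpoint and restarts the
// process when it becomes unresponsive. It returns when ctx is cancelled,
// stopping the daemon and cleaning up state files on the way out.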
func (r *remote) monitorDaemon(ctx context.Context) {
	var (
		transientFailureCount = 0
		client                *containerd.Client
		err                   error
		delay                 time.Duration
		timer                 = time.NewTimer(0)
		started               bool
	)

	defer func() {
		if r.daemonPid != -1 {
			r.stopDaemon()
		}

		// cleanup some files
		os.Remove(filepath.Join(r.stateDir, pidFile))

		r.platformCleanup()

		close(r.daemonStopCh)
		timer.Stop()
	}()

	// ensure no races on sending to timer.C even though there is a 0 duration.
	if !timer.Stop() {
		<-timer.C
	}

	for {
		timer.Reset(delay)

		select {
		case <-ctx.Done():
			r.logger.Info("stopping healthcheck following graceful shutdown")
			if client != nil {
				client.Close()
			}
			return
		case <-timer.C:
		}

		if r.daemonPid == -1 {
			if r.daemonWaitCh != nil {
				select {
				case <-ctx.Done():
					r.logger.Info("stopping containerd startup following graceful shutdown")
					return
				case <-r.daemonWaitCh:
				}
			}

			os.RemoveAll(r.GRPC.Address)
			if err := r.startContainerd(); err != nil {
				if !started {
					r.daemonStartCh <- err
					return
				}
				r.logger.WithError(err).Error("failed restarting containerd")
				delay = 50 * time.Millisecond
				continue
			}

			client, err = containerd.New(r.GRPC.Address, containerd.WithTimeout(60*time.Second))
			if err != nil {
				r.logger.WithError(err).Error("failed connecting to containerd")
				delay = 100 * time.Millisecond
				continue
			}
			r.logger.WithField("address", r.GRPC.Address).Debug("created containerd monitoring client")
		}

		if client != nil {
			tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout)
			_, err := client.IsServing(tctx)
			cancel()
			if err == nil {
				if !started {
					close(r.daemonStartCh)
					started = true
				}

				transientFailureCount = 0
				select {
				case <-r.daemonWaitCh:
				case <-ctx.Done():
				}

				// Set a small delay in case there is a recurring failure (or bug in this code)
				// to ensure we don't end up in a super tight loop.
				delay = 500 * time.Millisecond
				continue
			}

			r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding")

			transientFailureCount++
			if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) {
				delay = time.Duration(transientFailureCount) * 200 * time.Millisecond
				continue
			}
			client.Close()
			client = nil
		}

		if system.IsProcessAlive(r.daemonPid) {
			r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd")
			r.killDaemon()
		}

		r.daemonPid = -1
		delay = 0
		transientFailureCount = 0
	}
}