commit 6ab3b50a3f
With containerd configuration version `1`, containerd logs a deprecation warning on every daemon start; this patch generates the configuration with `version = 2`, which silences it.

Before this patch:

```
INFO[2022-07-27T14:30:06.188762628Z] Starting up
INFO[2022-07-27T14:30:06.190750725Z] libcontainerd: started new containerd process pid=2028
...
WARN[0000] containerd config version `1` has been deprecated and will be removed in containerd v2.0, please switch to version `2`, see https://github.com/containerd/containerd/blob/main/docs/PLUGINS.md#version-header
INFO[2022-07-27T14:30:06.220024286Z] starting containerd revision=10c12954828e7c7c9b6e0ea9b0c02b01407d3ae1 version=v1.6.6
```

With this patch:

```
INFO[2022-07-27T14:28:04.025543517Z] Starting up
INFO[2022-07-27T14:28:04.027447105Z] libcontainerd: started new containerd process pid=1377
...
INFO[2022-07-27T14:28:04.054483270Z] starting containerd revision=10c12954828e7c7c9b6e0ea9b0c02b01407d3ae1 version=v1.6.6
```
And the generated /var/run/docker/containerd/containerd.toml:
```toml
disabled_plugins = ["io.containerd.grpc.v1.cri"]
imports = []
oom_score = 0
plugin_dir = ""
required_plugins = []
root = "/var/lib/docker/containerd/daemon"
state = "/var/run/docker/containerd/daemon"
temp = ""
version = 2

[cgroup]
  path = ""

[debug]
  address = "/var/run/docker/containerd/containerd-debug.sock"
  format = ""
  gid = 0
  level = "debug"
  uid = 0

[grpc]
  address = "/var/run/docker/containerd/containerd.sock"
  gid = 0
  max_recv_message_size = 16777216
  max_send_message_size = 16777216
  tcp_address = ""
  tcp_tls_ca = ""
  tcp_tls_cert = ""
  tcp_tls_key = ""
  uid = 0

[metrics]
  address = ""
  grpc_histogram = false

[plugins]

[proxy_plugins]

[stream_processors]

[timeouts]

[ttrpc]
  address = ""
  gid = 0
  uid = 0
```
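For illustration, the sketch below shows how a file like this can be produced with `github.com/pelletier/go-toml`, the same encoder the supervisor code below uses. The `minimalConfig` struct is a hypothetical, trimmed-down stand-in for containerd's `config.Config`, modeling only a few of the fields shown above; the real code encodes the full embedded `config.Config`.

```go
package main

import (
	"os"

	"github.com/pelletier/go-toml"
)

// minimalConfig is a hypothetical, simplified stand-in for containerd's
// config.Config; only a few of the fields shown above are modeled.
type minimalConfig struct {
	Version         int      `toml:"version"`
	Root            string   `toml:"root"`
	State           string   `toml:"state"`
	DisabledPlugins []string `toml:"disabled_plugins"`
}

func main() {
	cfg := minimalConfig{
		Version:         2, // declare the v2 config schema to avoid the deprecation warning
		Root:            "/var/lib/docker/containerd/daemon",
		State:           "/var/run/docker/containerd/daemon",
		DisabledPlugins: []string{"io.containerd.grpc.v1.cri"},
	}

	f, err := os.Create("containerd.toml")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Encode the struct as a TOML document; getContainerdConfig below
	// uses the same mechanism on the full config struct.
	if err := toml.NewEncoder(f).Encode(cfg); err != nil {
		panic(err)
	}
}
```

The substance of the patch is visible in the `Version` field: once the struct carries `Version: 2`, the generated file declares the v2 schema and containerd no longer logs the deprecation warning.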
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
(cherry picked from commit ba2ff69894)
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
341 lines · 7.6 KiB · Go
```go
package supervisor // import "github.com/docker/docker/libcontainerd/supervisor"

import (
	"context"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/containerd/containerd"
	"github.com/containerd/containerd/services/server/config"
	"github.com/docker/docker/pkg/system"
	"github.com/pelletier/go-toml"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
)

const (
	maxConnectionRetryCount = 3
	healthCheckTimeout      = 3 * time.Second
	shutdownTimeout         = 15 * time.Second
	startupTimeout          = 15 * time.Second
	configFile              = "containerd.toml"
	binaryName              = "containerd"
	pidFile                 = "containerd.pid"
)

type remote struct {
	sync.RWMutex
	config.Config

	// Plugins overrides `Plugins map[string]toml.Tree` in config.Config.
	Plugins map[string]interface{} `toml:"plugins"`

	daemonPid int
	logger    *logrus.Entry

	daemonWaitCh  chan struct{}
	daemonStartCh chan error
	daemonStopCh  chan struct{}

	rootDir  string
	stateDir string
}

// Daemon represents a running containerd daemon
type Daemon interface {
	WaitTimeout(time.Duration) error
	Address() string
}

// DaemonOpt allows configuring parameters of container daemons
type DaemonOpt func(c *remote) error

// Start starts a containerd daemon and monitors it
func Start(ctx context.Context, rootDir, stateDir string, opts ...DaemonOpt) (Daemon, error) {
	r := &remote{
		rootDir:  rootDir,
		stateDir: stateDir,
		Config: config.Config{
			Version: 2,
			Root:    filepath.Join(rootDir, "daemon"),
			State:   filepath.Join(stateDir, "daemon"),
		},
		Plugins:       make(map[string]interface{}),
		daemonPid:     -1,
		logger:        logrus.WithField("module", "libcontainerd"),
		daemonStartCh: make(chan error, 1),
		daemonStopCh:  make(chan struct{}),
	}

	for _, opt := range opts {
		if err := opt(r); err != nil {
			return nil, err
		}
	}
	r.setDefaults()

	if err := system.MkdirAll(stateDir, 0700); err != nil {
		return nil, err
	}

	go r.monitorDaemon(ctx)

	timeout := time.NewTimer(startupTimeout)
	defer timeout.Stop()

	select {
	case <-timeout.C:
		return nil, errors.New("timeout waiting for containerd to start")
	case err := <-r.daemonStartCh:
		if err != nil {
			return nil, err
		}
	}

	return r, nil
}

func (r *remote) WaitTimeout(d time.Duration) error {
	timeout := time.NewTimer(d)
	defer timeout.Stop()

	select {
	case <-timeout.C:
		return errors.New("timeout waiting for containerd to stop")
	case <-r.daemonStopCh:
	}

	return nil
}

func (r *remote) Address() string {
	return r.GRPC.Address
}

// getContainerdPid reads the pid-file and returns the recorded PID if
// that process is still alive, or -1 otherwise.
func (r *remote) getContainerdPid() (int, error) {
	pidFile := filepath.Join(r.stateDir, pidFile)
	f, err := os.OpenFile(pidFile, os.O_RDWR, 0600)
	if err != nil {
		if os.IsNotExist(err) {
			return -1, nil
		}
		return -1, err
	}
	defer f.Close()

	b := make([]byte, 8)
	n, err := f.Read(b)
	if err != nil && err != io.EOF {
		return -1, err
	}

	if n > 0 {
		pid, err := strconv.ParseUint(string(b[:n]), 10, 64)
		if err != nil {
			return -1, err
		}
		if system.IsProcessAlive(int(pid)) {
			return int(pid), nil
		}
	}

	return -1, nil
}

// getContainerdConfig writes the daemon's containerd configuration to
// the state directory and returns the path of the generated file.
func (r *remote) getContainerdConfig() (string, error) {
	path := filepath.Join(r.stateDir, configFile)
	f, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600)
	if err != nil {
		return "", errors.Wrapf(err, "failed to open containerd config file at %s", path)
	}
	defer f.Close()

	if err := toml.NewEncoder(f).Encode(r); err != nil {
		return "", errors.Wrapf(err, "failed to write containerd config file (%s)", path)
	}
	return path, nil
}

// startContainerd reuses a containerd process recorded in the pid-file
// if it is still alive; otherwise it generates a fresh configuration
// and spawns a new process.
func (r *remote) startContainerd() error {
	pid, err := r.getContainerdPid()
	if err != nil {
		return err
	}

	if pid != -1 {
		r.daemonPid = pid
		logrus.WithField("pid", pid).
			Infof("libcontainerd: %s is still running", binaryName)
		return nil
	}

	configFile, err := r.getContainerdConfig()
	if err != nil {
		return err
	}

	args := []string{"--config", configFile}

	if r.Debug.Level != "" {
		args = append(args, "--log-level", r.Debug.Level)
	}

	cmd := exec.Command(binaryName, args...)
	// redirect containerd logs to docker logs
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	cmd.SysProcAttr = containerdSysProcAttr()
	// clear the NOTIFY_SOCKET from the env when starting containerd
	cmd.Env = nil
	for _, e := range os.Environ() {
		if !strings.HasPrefix(e, "NOTIFY_SOCKET") {
			cmd.Env = append(cmd.Env, e)
		}
	}
	if err := cmd.Start(); err != nil {
		return err
	}

	r.daemonWaitCh = make(chan struct{})
	go func() {
		// Reap our child when needed
		if err := cmd.Wait(); err != nil {
			r.logger.WithError(err).Errorf("containerd did not exit successfully")
		}
		close(r.daemonWaitCh)
	}()

	r.daemonPid = cmd.Process.Pid

	err = os.WriteFile(filepath.Join(r.stateDir, pidFile), []byte(fmt.Sprintf("%d", r.daemonPid)), 0660)
	if err != nil {
		system.KillProcess(r.daemonPid)
		return errors.Wrap(err, "libcontainerd: failed to save daemon pid to disk")
	}

	logrus.WithField("pid", r.daemonPid).
		Infof("libcontainerd: started new %s process", binaryName)

	return nil
}

// monitorDaemon supervises containerd: it (re)starts the process when
// needed, health-checks it over gRPC, and kills and restarts it when
// the health checks keep failing.
func (r *remote) monitorDaemon(ctx context.Context) {
	var (
		transientFailureCount = 0
		client                *containerd.Client
		err                   error
		delay                 time.Duration
		timer                 = time.NewTimer(0)
		started               bool
	)

	defer func() {
		if r.daemonPid != -1 {
			r.stopDaemon()
		}

		// cleanup some files
		os.Remove(filepath.Join(r.stateDir, pidFile))

		r.platformCleanup()

		close(r.daemonStopCh)
		timer.Stop()
	}()

	// ensure no races on sending to timer.C even though there is a 0 duration.
	if !timer.Stop() {
		<-timer.C
	}

	for {
		timer.Reset(delay)

		select {
		case <-ctx.Done():
			r.logger.Info("stopping healthcheck following graceful shutdown")
			if client != nil {
				client.Close()
			}
			return
		case <-timer.C:
		}

		if r.daemonPid == -1 {
			if r.daemonWaitCh != nil {
				select {
				case <-ctx.Done():
					r.logger.Info("stopping containerd startup following graceful shutdown")
					return
				case <-r.daemonWaitCh:
				}
			}

			os.RemoveAll(r.GRPC.Address)
			if err := r.startContainerd(); err != nil {
				if !started {
					r.daemonStartCh <- err
					return
				}
				r.logger.WithError(err).Error("failed restarting containerd")
				delay = 50 * time.Millisecond
				continue
			}

			client, err = containerd.New(r.GRPC.Address, containerd.WithTimeout(60*time.Second))
			if err != nil {
				r.logger.WithError(err).Error("failed connecting to containerd")
				delay = 100 * time.Millisecond
				continue
			}
			logrus.WithField("address", r.GRPC.Address).Debug("Created containerd monitoring client")
		}

		if client != nil {
			tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout)
			_, err := client.IsServing(tctx)
			cancel()
			if err == nil {
				if !started {
					close(r.daemonStartCh)
					started = true
				}

				transientFailureCount = 0

				select {
				case <-r.daemonWaitCh:
				case <-ctx.Done():
				}

				// Set a small delay in case there is a recurring failure (or bug in this code)
				// to ensure we don't end up in a super tight loop.
				delay = 500 * time.Millisecond
				continue
			}

			r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding")

			transientFailureCount++
			if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) {
				delay = time.Duration(transientFailureCount) * 200 * time.Millisecond
				continue
			}
			client.Close()
			client = nil
		}

		if system.IsProcessAlive(r.daemonPid) {
			r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd")
			r.killDaemon()
		}

		r.daemonPid = -1
		delay = 0
		transientFailureCount = 0
	}
}
```
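As a closing usage note, here is a hedged sketch of how the exported pieces fit together. It is written as if it lived inside the supervisor package, because `DaemonOpt` operates on the unexported `*remote`; `withDebugLogging` and `startAndStop` are illustrative assumptions, while `Start`, `Daemon.Address`, `Daemon.WaitTimeout`, and `shutdownTimeout` come from the file above.

```go
package supervisor

import (
	"context"
	"log"
)

// withDebugLogging is a hypothetical in-package DaemonOpt: it sets the
// debug log level on the config that startContainerd will serialize,
// the same field consulted for the --log-level flag above.
func withDebugLogging() DaemonOpt {
	return func(r *remote) error {
		r.Debug.Level = "debug"
		return nil
	}
}

// startAndStop sketches the lifecycle: Start blocks until containerd
// answers a health check (or startupTimeout elapses); cancelling the
// context triggers shutdown, and WaitTimeout waits for it to finish.
func startAndStop() error {
	ctx, cancel := context.WithCancel(context.Background())

	d, err := Start(ctx, "/var/lib/docker/containerd", "/var/run/docker/containerd", withDebugLogging())
	if err != nil {
		cancel()
		return err
	}
	log.Println("containerd listening on", d.Address())

	cancel()
	return d.WaitTimeout(shutdownTimeout)
}
```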