b6c7becbfe
This PR adds support for user-defined health-check probes for Docker containers. It adds a `HEALTHCHECK` instruction to the Dockerfile syntax plus some corresponding "docker run" options. It can be used with a restart policy to automatically restart a container if the check fails.

The `HEALTHCHECK` instruction has two forms:

* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container)
* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image)

The `HEALTHCHECK` instruction tells Docker how to test a container to check that it is still working. This can detect cases such as a web server that is stuck in an infinite loop and unable to handle new connections, even though the server process is still running.

When a container has a healthcheck specified, it has a _health status_ in addition to its normal status. This status is initially `starting`. Whenever a health check passes, it becomes `healthy` (whatever state it was previously in). After a certain number of consecutive failures, it becomes `unhealthy`.

The options that can appear before `CMD` are:

* `--interval=DURATION` (default: `30s`)
* `--timeout=DURATION` (default: `30s`)
* `--retries=N` (default: `1`)

The health check will first run **interval** seconds after the container is started, and then again **interval** seconds after each previous check completes. If a single run of the check takes longer than **timeout** seconds then the check is considered to have failed. It takes **retries** consecutive failures of the health check for the container to be considered `unhealthy`.

There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list more than one then only the last `HEALTHCHECK` will take effect.

The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands; see e.g. `ENTRYPOINT` for details).

The command's exit status indicates the health status of the container. The possible values are:

- 0: success - the container is healthy and ready for use
- 1: unhealthy - the container is not working correctly
- 2: starting - the container is not ready for use yet, but is working correctly

If the probe returns 2 ("starting") when the container has already moved out of the "starting" state then it is treated as "unhealthy" instead.

For example, to check every five minutes or so that a web server is able to serve the site's main page within three seconds:

    HEALTHCHECK --interval=5m --timeout=3s \
      CMD curl -f http://localhost/ || exit 1

To help debug failing probes, any output text (UTF-8 encoded) that the command writes on stdout or stderr will be stored in the health status and can be queried with `docker inspect`. Such output should be kept short (only the first 4096 bytes are stored currently).

When the health status of a container changes, a `health_status` event is generated with the new status. The health status is also displayed in the `docker ps` output.

Signed-off-by: Thomas Leonard <thomas.leonard@docker.com>
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
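As an illustration of the exit-code convention described above, a minimal probe could be written in Go along these lines; the endpoint, timeout, and binary name are invented for this sketch and are not part of this change:

    // healthprobe.go -- hypothetical probe, not part of this PR.
    // Exit codes follow the convention above: 0 healthy, 1 unhealthy, 2 starting.
    package main

    import (
        "net/http"
        "os"
        "time"
    )

    func main() {
        client := &http.Client{Timeout: 2 * time.Second}
        resp, err := client.Get("http://localhost/")
        if err != nil {
            os.Exit(2) // not reachable yet: report "starting"
        }
        resp.Body.Close()
        if resp.StatusCode != http.StatusOK {
            os.Exit(1) // reachable but failing: unhealthy
        }
        os.Exit(0) // healthy
    }

Built into the image, such a probe could then be wired up with e.g. `HEALTHCHECK CMD /bin/healthprobe` (path illustrative).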
156 lines
4.1 KiB
Go
package daemon

import (
	"errors"
	"fmt"
	"io"
	"runtime"
	"strconv"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/libcontainerd"
	"github.com/docker/docker/runconfig"
)

// StateChanged updates the daemon's state in response to state change notifications from containerd
func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
	c := daemon.containers.Get(id)
	if c == nil {
		return fmt.Errorf("no such container: %s", id)
	}

	switch e.State {
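	// The cases below also keep the container's health monitor in step with its
	// lifecycle: daemon.initHealthMonitor is invoked when the container starts or
	// is restored, and daemon.updateHealthMonitor whenever it exits, restarts, is
	// OOM-killed, or is paused/unpaused.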
	case libcontainerd.StateOOM:
		// StateOOM is Linux specific and should never be hit on Windows
		if runtime.GOOS == "windows" {
			return errors.New("Received StateOOM from libcontainerd on Windows. This should never happen.")
		}
		daemon.updateHealthMonitor(c)
		daemon.LogContainerEvent(c, "oom")
	case libcontainerd.StateExit:
		c.Lock()
		defer c.Unlock()
		c.Wait()
		c.Reset(false)
		c.SetStopped(platformConstructExitStatus(e))
		attributes := map[string]string{
			"exitCode": strconv.Itoa(int(e.ExitCode)),
		}
		daemon.updateHealthMonitor(c)
		daemon.LogContainerEventWithAttributes(c, "die", attributes)
		daemon.Cleanup(c)
		// FIXME: there is a race condition here between two RUN instructions in a
		// Dockerfile, because they share the same runconfig and change the image.
		// Must be fixed in builder/builder.go.
		if err := c.ToDisk(); err != nil {
			return err
		}
		return daemon.postRunProcessing(c, e)
	case libcontainerd.StateRestart:
		c.Lock()
		defer c.Unlock()
		c.Reset(false)
		c.RestartCount++
		c.SetRestarting(platformConstructExitStatus(e))
		attributes := map[string]string{
			"exitCode": strconv.Itoa(int(e.ExitCode)),
		}
		daemon.LogContainerEventWithAttributes(c, "die", attributes)
		daemon.updateHealthMonitor(c)
		return c.ToDisk()
	case libcontainerd.StateExitProcess:
		c.Lock()
		defer c.Unlock()
		if execConfig := c.ExecCommands.Get(e.ProcessID); execConfig != nil {
			ec := int(e.ExitCode)
			execConfig.ExitCode = &ec
			execConfig.Running = false
			execConfig.Wait()
			if err := execConfig.CloseStreams(); err != nil {
				logrus.Errorf("%s: %s", c.ID, err)
			}

			// remove the exec command from the container's store only, and not from
			// the daemon's store, so that the exec command can still be inspected.
			c.ExecCommands.Delete(execConfig.ID)
		} else {
			logrus.Warnf("Ignoring StateExitProcess for %v but no exec command found", e)
		}
	case libcontainerd.StateStart, libcontainerd.StateRestore:
		// Container is already locked in this case
		c.SetRunning(int(e.Pid), e.State == libcontainerd.StateStart)
		c.HasBeenManuallyStopped = false
		if err := c.ToDisk(); err != nil {
			c.Reset(false)
			return err
		}
		daemon.initHealthMonitor(c)
		daemon.LogContainerEvent(c, "start")
	case libcontainerd.StatePause:
		// Container is already locked in this case
		c.Paused = true
		daemon.updateHealthMonitor(c)
		daemon.LogContainerEvent(c, "pause")
	case libcontainerd.StateResume:
		// Container is already locked in this case
		c.Paused = false
		daemon.updateHealthMonitor(c)
		daemon.LogContainerEvent(c, "unpause")
	}

	return nil
}

// AttachStreams is called by libcontainerd to connect the stdio.
func (daemon *Daemon) AttachStreams(id string, iop libcontainerd.IOPipe) error {
	var s *runconfig.StreamConfig
	c := daemon.containers.Get(id)
	if c == nil {
		ec, err := daemon.getExecConfig(id)
		if err != nil {
			return fmt.Errorf("no such exec/container: %s", id)
		}
		s = ec.StreamConfig
	} else {
		s = c.StreamConfig
		if err := daemon.StartLogging(c); err != nil {
			c.Reset(false)
			return err
		}
	}
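
	// If the target has an attached stdin stream, pump it into containerd's
	// stdin pipe and close the pipe once that stream is drained. Otherwise, for
	// containers without a tty, close containerd's stdin right away (a tty keeps
	// stdin open).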
	if stdin := s.Stdin(); stdin != nil {
		if iop.Stdin != nil {
			go func() {
				io.Copy(iop.Stdin, stdin)
				iop.Stdin.Close()
			}()
		}
	} else {
		if c != nil && !c.Config.Tty {
			// if a tty is enabled, don't close containerd's iopipe stdin
			if iop.Stdin != nil {
				iop.Stdin.Close()
			}
		}
	}
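
	// copyFunc drains one of containerd's output pipes into the corresponding
	// daemon-side stream writer, registering with the stream config's Add/Done
	// counter so that callers can wait until all output has been copied.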
	copyFunc := func(w io.Writer, r io.Reader) {
		s.Add(1)
		go func() {
			if _, err := io.Copy(w, r); err != nil {
				logrus.Errorf("%v stream copy error: %v", id, err)
			}
			s.Done()
		}()
	}

	if iop.Stdout != nil {
		copyFunc(s.Stdout(), iop.Stdout)
	}
	if iop.Stderr != nil {
		copyFunc(s.Stderr(), iop.Stderr)
	}

	return nil
}