b6c7becbfe
This PR adds support for user-defined health-check probes for Docker containers. It adds a `HEALTHCHECK` instruction to the Dockerfile syntax plus some corresponding "docker run" options. It can be used with a restart policy to automatically restart a container if the check fails. The `HEALTHCHECK` instruction has two forms: * `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container) * `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image) The `HEALTHCHECK` instruction tells Docker how to test a container to check that it is still working. This can detect cases such as a web server that is stuck in an infinite loop and unable to handle new connections, even though the server process is still running. When a container has a healthcheck specified, it has a _health status_ in addition to its normal status. This status is initially `starting`. Whenever a health check passes, it becomes `healthy` (whatever state it was previously in). After a certain number of consecutive failures, it becomes `unhealthy`. The options that can appear before `CMD` are: * `--interval=DURATION` (default: `30s`) * `--timeout=DURATION` (default: `30s`) * `--retries=N` (default: `1`) The health check will first run **interval** seconds after the container is started, and then again **interval** seconds after each previous check completes. If a single run of the check takes longer than **timeout** seconds then the check is considered to have failed. It takes **retries** consecutive failures of the health check for the container to be considered `unhealthy`. There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list more than one then only the last `HEALTHCHECK` will take effect. The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands; see e.g. `ENTRYPOINT` for details). The command's exit status indicates the health status of the container. The possible values are: - 0: success - the container is healthy and ready for use - 1: unhealthy - the container is not working correctly - 2: starting - the container is not ready for use yet, but is working correctly If the probe returns 2 ("starting") when the container has already moved out of the "starting" state then it is treated as "unhealthy" instead. For example, to check every five minutes or so that a web-server is able to serve the site's main page within three seconds: HEALTHCHECK --interval=5m --timeout=3s \ CMD curl -f http://localhost/ || exit 1 To help debug failing probes, any output text (UTF-8 encoded) that the command writes on stdout or stderr will be stored in the health status and can be queried with `docker inspect`. Such output should be kept short (only the first 4096 bytes are stored currently). When the health status of a container changes, a `health_status` event is generated with the new status. The health status is also displayed in the `docker ps` output. Signed-off-by: Thomas Leonard <thomas.leonard@docker.com> Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
154 lines
4.9 KiB
Go
154 lines
4.9 KiB
Go
package main
|
|
|
|
import (
|
|
"encoding/json"
|
|
"github.com/docker/docker/pkg/integration/checker"
|
|
"github.com/docker/engine-api/types"
|
|
"github.com/go-check/check"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
func waitForStatus(c *check.C, name string, prev string, expected string) {
|
|
prev = prev + "\n"
|
|
expected = expected + "\n"
|
|
for {
|
|
out, _ := dockerCmd(c, "inspect", "--format={{.State.Status}}", name)
|
|
if out == expected {
|
|
return
|
|
}
|
|
c.Check(out, checker.Equals, prev)
|
|
if out != prev {
|
|
return
|
|
}
|
|
time.Sleep(100 * time.Millisecond)
|
|
}
|
|
}
|
|
|
|
func waitForHealthStatus(c *check.C, name string, prev string, expected string) {
|
|
prev = prev + "\n"
|
|
expected = expected + "\n"
|
|
for {
|
|
out, _ := dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name)
|
|
if out == expected {
|
|
return
|
|
}
|
|
c.Check(out, checker.Equals, prev)
|
|
if out != prev {
|
|
return
|
|
}
|
|
time.Sleep(100 * time.Millisecond)
|
|
}
|
|
}
|
|
|
|
func getHealth(c *check.C, name string) *types.Health {
|
|
out, _ := dockerCmd(c, "inspect", "--format={{json .State.Health}}", name)
|
|
var health types.Health
|
|
err := json.Unmarshal([]byte(out), &health)
|
|
c.Check(err, checker.Equals, nil)
|
|
return &health
|
|
}
|
|
|
|
func (s *DockerSuite) TestHealth(c *check.C) {
|
|
testRequires(c, DaemonIsLinux) // busybox doesn't work on Windows
|
|
|
|
imageName := "testhealth"
|
|
_, err := buildImage(imageName,
|
|
`FROM busybox
|
|
RUN echo OK > /status
|
|
CMD ["/bin/sleep", "120"]
|
|
STOPSIGNAL SIGKILL
|
|
HEALTHCHECK --interval=1s --timeout=30s \
|
|
CMD cat /status`,
|
|
true)
|
|
|
|
c.Check(err, check.IsNil)
|
|
|
|
// No health status before starting
|
|
name := "test_health"
|
|
dockerCmd(c, "create", "--name", name, imageName)
|
|
out, _ := dockerCmd(c, "ps", "-a", "--format={{.Status}}")
|
|
c.Check(out, checker.Equals, "Created\n")
|
|
|
|
// Inspect the options
|
|
out, _ = dockerCmd(c, "inspect",
|
|
"--format='timeout={{.Config.Healthcheck.Timeout}} "+
|
|
"interval={{.Config.Healthcheck.Interval}} "+
|
|
"retries={{.Config.Healthcheck.Retries}} "+
|
|
"test={{.Config.Healthcheck.Test}}'", name)
|
|
c.Check(out, checker.Equals, "timeout=30s interval=1s retries=0 test=[CMD-SHELL cat /status]\n")
|
|
|
|
// Start
|
|
dockerCmd(c, "start", name)
|
|
waitForHealthStatus(c, name, "starting", "healthy")
|
|
|
|
// Make it fail
|
|
dockerCmd(c, "exec", name, "rm", "/status")
|
|
waitForHealthStatus(c, name, "healthy", "unhealthy")
|
|
|
|
// Inspect the status
|
|
out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name)
|
|
c.Check(out, checker.Equals, "unhealthy\n")
|
|
|
|
// Make it healthy again
|
|
dockerCmd(c, "exec", name, "touch", "/status")
|
|
waitForHealthStatus(c, name, "unhealthy", "healthy")
|
|
|
|
// Remove container
|
|
dockerCmd(c, "rm", "-f", name)
|
|
|
|
// Disable the check from the CLI
|
|
out, _ = dockerCmd(c, "create", "--name=noh", "--no-healthcheck", imageName)
|
|
out, _ = dockerCmd(c, "inspect", "--format={{.Config.Healthcheck.Test}}", "noh")
|
|
c.Check(out, checker.Equals, "[NONE]\n")
|
|
dockerCmd(c, "rm", "noh")
|
|
|
|
// Disable the check with a new build
|
|
_, err = buildImage("no_healthcheck",
|
|
`FROM testhealth
|
|
HEALTHCHECK NONE`, true)
|
|
c.Check(err, check.IsNil)
|
|
|
|
out, _ = dockerCmd(c, "inspect", "--format={{.ContainerConfig.Healthcheck.Test}}", "no_healthcheck")
|
|
c.Check(out, checker.Equals, "[NONE]\n")
|
|
|
|
// Enable the checks from the CLI
|
|
_, _ = dockerCmd(c, "run", "-d", "--name=fatal_healthcheck",
|
|
"--health-interval=0.5s",
|
|
"--health-retries=3",
|
|
"--health-cmd=cat /status",
|
|
"no_healthcheck")
|
|
waitForHealthStatus(c, "fatal_healthcheck", "starting", "healthy")
|
|
health := getHealth(c, "fatal_healthcheck")
|
|
c.Check(health.Status, checker.Equals, "healthy")
|
|
c.Check(health.FailingStreak, checker.Equals, 0)
|
|
last := health.Log[len(health.Log)-1]
|
|
c.Check(last.ExitCode, checker.Equals, 0)
|
|
c.Check(last.Output, checker.Equals, "OK\n")
|
|
|
|
// Fail the check, which should now make it exit
|
|
dockerCmd(c, "exec", "fatal_healthcheck", "rm", "/status")
|
|
waitForStatus(c, "fatal_healthcheck", "running", "exited")
|
|
|
|
out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", "fatal_healthcheck")
|
|
c.Check(out, checker.Equals, "unhealthy\n")
|
|
failsStr, _ := dockerCmd(c, "inspect", "--format={{.State.Health.FailingStreak}}", "fatal_healthcheck")
|
|
fails, err := strconv.Atoi(strings.TrimSpace(failsStr))
|
|
c.Check(err, check.IsNil)
|
|
c.Check(fails >= 3, checker.Equals, true)
|
|
dockerCmd(c, "rm", "-f", "fatal_healthcheck")
|
|
|
|
// Check timeout
|
|
// Note: if the interval is too small, it seems that Docker spends all its time running health
|
|
// checks and never gets around to killing it.
|
|
_, _ = dockerCmd(c, "run", "-d", "--name=test",
|
|
"--health-interval=1s", "--health-cmd=sleep 5m", "--health-timeout=1ms", imageName)
|
|
waitForHealthStatus(c, "test", "starting", "unhealthy")
|
|
health = getHealth(c, "test")
|
|
last = health.Log[len(health.Log)-1]
|
|
c.Check(health.Status, checker.Equals, "unhealthy")
|
|
c.Check(last.ExitCode, checker.Equals, -1)
|
|
c.Check(last.Output, checker.Equals, "Health check exceeded timeout (1ms)")
|
|
dockerCmd(c, "rm", "-f", "test")
|
|
}
|