
Add support for user-defined healthchecks

This PR adds support for user-defined health-check probes for Docker
containers. It adds a `HEALTHCHECK` instruction to the Dockerfile syntax and
corresponding `docker run` options. The feature can be combined with a restart
policy to automatically restart a container if the check fails.

The `HEALTHCHECK` instruction has two forms:

* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container)
* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image)

The `HEALTHCHECK` instruction tells Docker how to test a container to check that
it is still working. This can detect cases such as a web server that is stuck in
an infinite loop and unable to handle new connections, even though the server
process is still running.

When a container has a healthcheck specified, it has a _health status_ in
addition to its normal status. This status is initially `starting`. Whenever a
health check passes, it becomes `healthy` (whatever state it was previously in).
After a certain number of consecutive failures, it becomes `unhealthy`.

The options that can appear before `CMD` are:

* `--interval=DURATION` (default: `30s`)
* `--timeout=DURATION` (default: `30s`)
* `--retries=N` (default: `1`)

The health check will first run **interval** seconds after the container is
started, and then again **interval** seconds after each previous check completes.

If a single run of the check takes longer than **timeout** seconds then the check
is considered to have failed.

It takes **retries** consecutive failures of the health check for the container
to be considered `unhealthy`.

There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list
more than one then only the last `HEALTHCHECK` will take effect.

The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK
CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands;
see e.g. `ENTRYPOINT` for details).

The command's exit status indicates the health status of the container.
The possible values are:

- 0: success - the container is healthy and ready for use
- 1: unhealthy - the container is not working correctly
- 2: starting - the container is not ready for use yet, but is working correctly

If the probe returns 2 ("starting") when the container has already moved out of the
"starting" state then it is treated as "unhealthy" instead.

For example, to check every five minutes or so that a web-server is able to
serve the site's main page within three seconds:

    HEALTHCHECK --interval=5m --timeout=3s \
      CMD curl -f http://localhost/ || exit 1

To help debug failing probes, any output text (UTF-8 encoded) that the command writes
on stdout or stderr will be stored in the health status and can be queried with
`docker inspect`. Such output should be kept short (only the first 4096 bytes
are stored currently).

When the health status of a container changes, a `health_status` event is
generated with the new status. The health status is also displayed in the
`docker ps` output.

Signed-off-by: Thomas Leonard <thomas.leonard@docker.com>
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
Thomas Leonard · 9 years ago · commit b6c7becbfe

+ 1 - 1
api/server/router/container/backend.go

@@ -17,7 +17,7 @@ type execBackend interface {
 	ContainerExecCreate(name string, config *types.ExecConfig) (string, error)
 	ContainerExecInspect(id string) (*backend.ExecInspect, error)
 	ContainerExecResize(name string, height, width int) error
-	ContainerExecStart(name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) error
+	ContainerExecStart(ctx context.Context, name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) error
 	ExecExists(name string) (bool, error)
 }
 

+ 2 - 1
api/server/router/container/exec.go

@@ -106,7 +106,8 @@ func (s *containerRouter) postContainerExecStart(ctx context.Context, w http.Res
 	}
 
 	// Now run the user process in container.
-	if err := s.backend.ContainerExecStart(execName, stdin, stdout, stderr); err != nil {
+	// Maybe we should pass ctx here if we're not detaching?
+	if err := s.backend.ContainerExecStart(context.Background(), execName, stdin, stdout, stderr); err != nil {
 		if execStartCheck.Detach {
 			return err
 		}

+ 10 - 9
builder/dockerfile/builder.go

@@ -22,15 +22,16 @@ import (
 )
 
 var validCommitCommands = map[string]bool{
-	"cmd":        true,
-	"entrypoint": true,
-	"env":        true,
-	"expose":     true,
-	"label":      true,
-	"onbuild":    true,
-	"user":       true,
-	"volume":     true,
-	"workdir":    true,
+	"cmd":         true,
+	"entrypoint":  true,
+	"healthcheck": true,
+	"env":         true,
+	"expose":      true,
+	"label":       true,
+	"onbuild":     true,
+	"user":        true,
+	"volume":      true,
+	"workdir":     true,
 }
 
 // BuiltinAllowedBuildArgs is list of built-in allowed build args

+ 34 - 32
builder/dockerfile/command/command.go

@@ -3,40 +3,42 @@ package command
 
 // Define constants for the command strings
 const (
-	Env        = "env"
-	Label      = "label"
-	Maintainer = "maintainer"
-	Add        = "add"
-	Copy       = "copy"
-	From       = "from"
-	Onbuild    = "onbuild"
-	Workdir    = "workdir"
-	Run        = "run"
-	Cmd        = "cmd"
-	Entrypoint = "entrypoint"
-	Expose     = "expose"
-	Volume     = "volume"
-	User       = "user"
-	StopSignal = "stopsignal"
-	Arg        = "arg"
+	Env         = "env"
+	Label       = "label"
+	Maintainer  = "maintainer"
+	Add         = "add"
+	Copy        = "copy"
+	From        = "from"
+	Onbuild     = "onbuild"
+	Workdir     = "workdir"
+	Run         = "run"
+	Cmd         = "cmd"
+	Entrypoint  = "entrypoint"
+	Expose      = "expose"
+	Volume      = "volume"
+	User        = "user"
+	StopSignal  = "stopsignal"
+	Arg         = "arg"
+	Healthcheck = "healthcheck"
 )
 
 // Commands is list of all Dockerfile commands
 var Commands = map[string]struct{}{
-	Env:        {},
-	Label:      {},
-	Maintainer: {},
-	Add:        {},
-	Copy:       {},
-	From:       {},
-	Onbuild:    {},
-	Workdir:    {},
-	Run:        {},
-	Cmd:        {},
-	Entrypoint: {},
-	Expose:     {},
-	Volume:     {},
-	User:       {},
-	StopSignal: {},
-	Arg:        {},
+	Env:         {},
+	Label:       {},
+	Maintainer:  {},
+	Add:         {},
+	Copy:        {},
+	From:        {},
+	Onbuild:     {},
+	Workdir:     {},
+	Run:         {},
+	Cmd:         {},
+	Entrypoint:  {},
+	Expose:      {},
+	Volume:      {},
+	User:        {},
+	StopSignal:  {},
+	Arg:         {},
+	Healthcheck: {},
 }

+ 107 - 0
builder/dockerfile/dispatchers.go

@@ -12,7 +12,9 @@ import (
 	"regexp"
 	"runtime"
 	"sort"
+	"strconv"
 	"strings"
+	"time"
 
 	"github.com/Sirupsen/logrus"
 	"github.com/docker/docker/api"
@@ -426,6 +428,111 @@ func cmd(b *Builder, args []string, attributes map[string]bool, original string)
 	return nil
 }
 
+// parseOptInterval(flag) is the duration of flag.Value, or 0 if
+// empty. An error is reported if the value is given and is not positive.
+func parseOptInterval(f *Flag) (time.Duration, error) {
+	s := f.Value
+	if s == "" {
+		return 0, nil
+	}
+	d, err := time.ParseDuration(s)
+	if err != nil {
+		return 0, err
+	}
+	if d <= 0 {
+		return 0, fmt.Errorf("Interval %#v must be positive", f.name)
+	}
+	return d, nil
+}
+
+// HEALTHCHECK foo
+//
+// Set the default healthcheck command to run in the container (which may be empty).
+// Argument handling is the same as RUN.
+//
+func healthcheck(b *Builder, args []string, attributes map[string]bool, original string) error {
+	if len(args) == 0 {
+		return fmt.Errorf("HEALTHCHECK requires an argument")
+	}
+	typ := strings.ToUpper(args[0])
+	args = args[1:]
+	if typ == "NONE" {
+		if len(args) != 0 {
+			return fmt.Errorf("HEALTHCHECK NONE takes no arguments")
+		}
+		test := strslice.StrSlice{typ}
+		b.runConfig.Healthcheck = &container.HealthConfig{
+			Test: test,
+		}
+	} else {
+		if b.runConfig.Healthcheck != nil {
+			oldCmd := b.runConfig.Healthcheck.Test
+			if len(oldCmd) > 0 && oldCmd[0] != "NONE" {
+				fmt.Fprintf(b.Stdout, "Note: overriding previous HEALTHCHECK: %v\n", oldCmd)
+			}
+		}
+
+		healthcheck := container.HealthConfig{}
+
+		flInterval := b.flags.AddString("interval", "")
+		flTimeout := b.flags.AddString("timeout", "")
+		flRetries := b.flags.AddString("retries", "")
+
+		if err := b.flags.Parse(); err != nil {
+			return err
+		}
+
+		switch typ {
+		case "CMD":
+			cmdSlice := handleJSONArgs(args, attributes)
+			if len(cmdSlice) == 0 {
+				return fmt.Errorf("Missing command after HEALTHCHECK CMD")
+			}
+
+			if !attributes["json"] {
+				typ = "CMD-SHELL"
+			}
+
+			healthcheck.Test = strslice.StrSlice(append([]string{typ}, cmdSlice...))
+		default:
+			return fmt.Errorf("Unknown type %#v in HEALTHCHECK (try CMD)", typ)
+		}
+
+		interval, err := parseOptInterval(flInterval)
+		if err != nil {
+			return err
+		}
+		healthcheck.Interval = interval
+
+		timeout, err := parseOptInterval(flTimeout)
+		if err != nil {
+			return err
+		}
+		healthcheck.Timeout = timeout
+
+		if flRetries.Value != "" {
+			retries, err := strconv.ParseInt(flRetries.Value, 10, 32)
+			if err != nil {
+				return err
+			}
+			if retries < 1 {
+				return fmt.Errorf("--retries must be at least 1 (not %d)", retries)
+			}
+			healthcheck.Retries = int(retries)
+		} else {
+			healthcheck.Retries = 0
+		}
+
+		b.runConfig.Healthcheck = &healthcheck
+	}
+
+	if err := b.commit("", b.runConfig.Cmd, fmt.Sprintf("HEALTHCHECK %q", b.runConfig.Healthcheck)); err != nil {
+		return err
+	}
+
+	return nil
+}
+
 // ENTRYPOINT /usr/sbin/nginx
 //
 // Set the entrypoint (which defaults to sh -c on linux, or cmd /S /C on Windows) to

+ 17 - 16
builder/dockerfile/evaluator.go

@@ -58,22 +58,23 @@ var evaluateTable map[string]func(*Builder, []string, map[string]bool, string) e
 
 func init() {
 	evaluateTable = map[string]func(*Builder, []string, map[string]bool, string) error{
-		command.Env:        env,
-		command.Label:      label,
-		command.Maintainer: maintainer,
-		command.Add:        add,
-		command.Copy:       dispatchCopy, // copy() is a go builtin
-		command.From:       from,
-		command.Onbuild:    onbuild,
-		command.Workdir:    workdir,
-		command.Run:        run,
-		command.Cmd:        cmd,
-		command.Entrypoint: entrypoint,
-		command.Expose:     expose,
-		command.Volume:     volume,
-		command.User:       user,
-		command.StopSignal: stopSignal,
-		command.Arg:        arg,
+		command.Env:         env,
+		command.Label:       label,
+		command.Maintainer:  maintainer,
+		command.Add:         add,
+		command.Copy:        dispatchCopy, // copy() is a go builtin
+		command.From:        from,
+		command.Onbuild:     onbuild,
+		command.Workdir:     workdir,
+		command.Run:         run,
+		command.Cmd:         cmd,
+		command.Entrypoint:  entrypoint,
+		command.Expose:      expose,
+		command.Volume:      volume,
+		command.User:        user,
+		command.StopSignal:  stopSignal,
+		command.Arg:         arg,
+		command.Healthcheck: healthcheck,
 	}
 }
 

+ 29 - 0
builder/dockerfile/parser/line_parsers.go

@@ -329,3 +329,32 @@ func parseMaybeJSONToList(rest string) (*Node, map[string]bool, error) {
 
 	return parseStringsWhitespaceDelimited(rest)
 }
+
+// The HEALTHCHECK command is like parseMaybeJSON, but has an extra type argument.
+func parseHealthConfig(rest string) (*Node, map[string]bool, error) {
+	// Find end of first argument
+	var sep int
+	for ; sep < len(rest); sep++ {
+		if unicode.IsSpace(rune(rest[sep])) {
+			break
+		}
+	}
+	next := sep
+	for ; next < len(rest); next++ {
+		if !unicode.IsSpace(rune(rest[next])) {
+			break
+		}
+	}
+
+	if sep == 0 {
+		return nil, nil, nil
+	}
+
+	typ := rest[:sep]
+	cmd, attrs, err := parseMaybeJSON(rest[next:])
+	if err != nil {
+		return nil, nil, err
+	}
+
+	return &Node{Value: typ, Next: cmd, Attributes: attrs}, nil, err
+}

+ 17 - 16
builder/dockerfile/parser/parser.go

@@ -66,22 +66,23 @@ func init() {
 	// functions. Errors are propagated up by Parse() and the resulting AST can
 	// be incorporated directly into the existing AST as a next.
 	dispatch = map[string]func(string) (*Node, map[string]bool, error){
-		command.User:       parseString,
-		command.Onbuild:    parseSubCommand,
-		command.Workdir:    parseString,
-		command.Env:        parseEnv,
-		command.Label:      parseLabel,
-		command.Maintainer: parseString,
-		command.From:       parseString,
-		command.Add:        parseMaybeJSONToList,
-		command.Copy:       parseMaybeJSONToList,
-		command.Run:        parseMaybeJSON,
-		command.Cmd:        parseMaybeJSON,
-		command.Entrypoint: parseMaybeJSON,
-		command.Expose:     parseStringsWhitespaceDelimited,
-		command.Volume:     parseMaybeJSONToList,
-		command.StopSignal: parseString,
-		command.Arg:        parseNameOrNameVal,
+		command.User:        parseString,
+		command.Onbuild:     parseSubCommand,
+		command.Workdir:     parseString,
+		command.Env:         parseEnv,
+		command.Label:       parseLabel,
+		command.Maintainer:  parseString,
+		command.From:        parseString,
+		command.Add:         parseMaybeJSONToList,
+		command.Copy:        parseMaybeJSONToList,
+		command.Run:         parseMaybeJSON,
+		command.Cmd:         parseMaybeJSON,
+		command.Entrypoint:  parseMaybeJSON,
+		command.Expose:      parseStringsWhitespaceDelimited,
+		command.Volume:      parseMaybeJSONToList,
+		command.StopSignal:  parseString,
+		command.Arg:         parseNameOrNameVal,
+		command.Healthcheck: parseHealthConfig,
 	}
 }
 

+ 10 - 0
builder/dockerfile/parser/testfiles/health/Dockerfile

@@ -0,0 +1,10 @@
+FROM debian
+ADD check.sh main.sh /app/
+CMD /app/main.sh
+HEALTHCHECK
+HEALTHCHECK --interval=5s --timeout=3s --retries=1 \
+  CMD /app/check.sh --quiet
+HEALTHCHECK CMD
+HEALTHCHECK   CMD   a b
+HEALTHCHECK --timeout=3s CMD ["foo"]
+HEALTHCHECK CONNECT TCP 7000

+ 9 - 0
builder/dockerfile/parser/testfiles/health/result

@@ -0,0 +1,9 @@
+(from "debian")
+(add "check.sh" "main.sh" "/app/")
+(cmd "/app/main.sh")
+(healthcheck)
+(healthcheck ["--interval=5s" "--timeout=3s" "--retries=1"] "CMD" "/app/check.sh --quiet")
+(healthcheck "CMD")
+(healthcheck "CMD" "a b")
+(healthcheck ["--timeout=3s"] "CMD" "foo")
+(healthcheck "CONNECT" "TCP 7000")

+ 49 - 0
container/health.go

@@ -0,0 +1,49 @@
+package container
+
+import (
+	"github.com/Sirupsen/logrus"
+	"github.com/docker/engine-api/types"
+)
+
+// Health holds the current container health-check state
+type Health struct {
+	types.Health
+	stop chan struct{} // Write struct{} to stop the monitor
+}
+
+// String returns a human-readable description of the health-check state
+func (s *Health) String() string {
+	if s.stop == nil {
+		return "no healthcheck"
+	}
+	switch s.Status {
+	case types.Starting:
+		return "health: starting"
+	default: // Healthy and Unhealthy are clear on their own
+		return s.Status
+	}
+}
+
+// OpenMonitorChannel creates and returns a new monitor channel. If there already is one,
+// it returns nil.
+func (s *Health) OpenMonitorChannel() chan struct{} {
+	if s.stop == nil {
+		logrus.Debugf("OpenMonitorChannel")
+		s.stop = make(chan struct{})
+		return s.stop
+	}
+	return nil
+}
+
+// CloseMonitorChannel closes any existing monitor channel.
+func (s *Health) CloseMonitorChannel() {
+	if s.stop != nil {
+		logrus.Debugf("CloseMonitorChannel: waiting for probe to stop")
+		// This channel does not buffer. Once the write succeeds, the monitor
+		// has read the stop request and will not make any further updates
+		// to c.State.Health.
+		s.stop <- struct{}{}
+		s.stop = nil
+		logrus.Debugf("CloseMonitorChannel done")
+	}
+}
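
The unbuffered `stop` channel gives a simple handshake: the send in
`CloseMonitorChannel` blocks until the monitor goroutine has actually received
the request, so no further health updates can race with the close. A
standalone sketch of that pattern (the `monitorLoop` here is a hypothetical
stand-in for the real monitor in `daemon/health.go`):

```go
package main

import (
	"fmt"
	"time"
)

// monitorLoop wakes up at each interval and exits when it reads from stop,
// like the daemon's monitor goroutine.
func monitorLoop(stop chan struct{}, interval time.Duration) {
	for {
		select {
		case <-stop:
			fmt.Println("monitor: stop received, exiting")
			return
		case <-time.After(interval):
			fmt.Println("monitor: running probe")
		}
	}
}

func main() {
	stop := make(chan struct{}) // unbuffered, like Health.stop
	go monitorLoop(stop, 50*time.Millisecond)
	time.Sleep(120 * time.Millisecond)

	// Blocks until monitorLoop has read the request, mirroring the guarantee
	// documented in CloseMonitorChannel above.
	stop <- struct{}{}
}
```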

+ 4 - 0
container/state.go

@@ -27,6 +27,7 @@ type State struct {
 	StartedAt         time.Time
 	FinishedAt        time.Time
 	waitChan          chan struct{}
+	Health            *Health
 }
 
 // NewState creates a default state object with a fresh channel for state changes.
@@ -46,6 +47,9 @@ func (s *State) String() string {
 			return fmt.Sprintf("Restarting (%d) %s ago", s.ExitCode, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt)))
 		}
 
+		if h := s.Health; h != nil {
+			return fmt.Sprintf("Up %s (%s)", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)), h.String())
+		}
 		return fmt.Sprintf("Up %s", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)))
 	}
 

+ 19 - 0
daemon/commit.go

@@ -80,6 +80,25 @@ func merge(userConf, imageConf *containertypes.Config) error {
 			userConf.Entrypoint = imageConf.Entrypoint
 		}
 	}
+	if imageConf.Healthcheck != nil {
+		if userConf.Healthcheck == nil {
+			userConf.Healthcheck = imageConf.Healthcheck
+		} else {
+			if len(userConf.Healthcheck.Test) == 0 {
+				userConf.Healthcheck.Test = imageConf.Healthcheck.Test
+			}
+			if userConf.Healthcheck.Interval == 0 {
+				userConf.Healthcheck.Interval = imageConf.Healthcheck.Interval
+			}
+			if userConf.Healthcheck.Timeout == 0 {
+				userConf.Healthcheck.Timeout = imageConf.Healthcheck.Timeout
+			}
+			if userConf.Healthcheck.Retries == 0 {
+				userConf.Healthcheck.Retries = imageConf.Healthcheck.Retries
+			}
+		}
+	}
+
 	if userConf.WorkingDir == "" {
 		userConf.WorkingDir = imageConf.WorkingDir
 	}

+ 23 - 5
daemon/exec.go

@@ -14,11 +14,15 @@ import (
 	"github.com/docker/docker/errors"
 	"github.com/docker/docker/libcontainerd"
 	"github.com/docker/docker/pkg/pools"
+	"github.com/docker/docker/pkg/signal"
 	"github.com/docker/docker/pkg/term"
 	"github.com/docker/engine-api/types"
 	"github.com/docker/engine-api/types/strslice"
 )
 
+// Seconds to wait after sending TERM before trying KILL
+const termProcessTimeout = 10
+
 func (d *Daemon) registerExecCommand(container *container.Container, config *exec.Config) {
 	// Storing execs in container in order to kill them gracefully whenever the container is stopped or removed.
 	container.ExecCommands.Add(config.ID, config)
@@ -130,7 +134,8 @@ func (d *Daemon) ContainerExecCreate(name string, config *types.ExecConfig) (str
 
 // ContainerExecStart starts a previously set up exec instance. The
 // std streams are set up.
-func (d *Daemon) ContainerExecStart(name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) (err error) {
+// If ctx is cancelled, the process is terminated.
+func (d *Daemon) ContainerExecStart(ctx context.Context, name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) (err error) {
 	var (
 		cStdin           io.ReadCloser
 		cStdout, cStderr io.Writer
@@ -197,15 +202,28 @@ func (d *Daemon) ContainerExecStart(name string, stdin io.ReadCloser, stdout io.
 		return nil
 	}
 
-	attachErr := container.AttachStreams(context.Background(), ec.StreamConfig, ec.OpenStdin, true, ec.Tty, cStdin, cStdout, cStderr, ec.DetachKeys)
+	attachErr := container.AttachStreams(ctx, ec.StreamConfig, ec.OpenStdin, true, ec.Tty, cStdin, cStdout, cStderr, ec.DetachKeys)
 
 	if err := d.containerd.AddProcess(c.ID, name, p); err != nil {
 		return err
 	}
 
-	err = <-attachErr
-	if err != nil {
-		return fmt.Errorf("attach failed with error: %v", err)
+	select {
+	case <-ctx.Done():
+		logrus.Debugf("Sending TERM signal to process %v in container %v", name, c.ID)
+		d.containerd.SignalProcess(c.ID, name, int(signal.SignalMap["TERM"]))
+		select {
+		case <-time.After(termProcessTimeout * time.Second):
+			logrus.Infof("Container %v, process %v failed to exit within %d seconds of signal TERM - using the force", c.ID, name, termProcessTimeout)
+			d.containerd.SignalProcess(c.ID, name, int(signal.SignalMap["KILL"]))
+		case <-attachErr:
+			// TERM signal worked
+		}
+		return fmt.Errorf("context cancelled")
+	case err := <-attachErr:
+		if err != nil {
+			return fmt.Errorf("attach failed with error: %v", err)
+		}
 	}
 	return nil
 }
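
The cancellation path above follows a general escalation pattern: on `ctx`
cancellation send TERM, wait up to `termProcessTimeout`, then KILL. A
self-contained sketch of the same pattern using `os/exec` (an illustration
only, assuming a Unix-like system; the daemon signals through containerd
rather than `os/exec`):

```go
package main

import (
	"context"
	"fmt"
	"os/exec"
	"syscall"
	"time"
)

// runWithGracefulStop starts cmd and, if ctx is cancelled first, sends
// SIGTERM, escalating to SIGKILL after termTimeout.
func runWithGracefulStop(ctx context.Context, cmd *exec.Cmd, termTimeout time.Duration) error {
	if err := cmd.Start(); err != nil {
		return err
	}
	done := make(chan error, 1)
	go func() { done <- cmd.Wait() }()

	select {
	case err := <-done:
		return err
	case <-ctx.Done():
		cmd.Process.Signal(syscall.SIGTERM)
		select {
		case <-done:
			// TERM worked.
		case <-time.After(termTimeout):
			cmd.Process.Kill() // force KILL
			<-done             // reap the process
		}
		return fmt.Errorf("context cancelled")
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer cancel()
	fmt.Println(runWithGracefulStop(ctx, exec.Command("sleep", "10"), time.Second))
}
```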

+ 314 - 0
daemon/health.go

@@ -0,0 +1,314 @@
+package daemon
+
+import (
+	"bytes"
+	"fmt"
+	"runtime"
+	"strings"
+	"time"
+
+	"golang.org/x/net/context"
+
+	"github.com/Sirupsen/logrus"
+	"github.com/docker/docker/container"
+	"github.com/docker/docker/daemon/exec"
+	"github.com/docker/engine-api/types"
+	"github.com/docker/engine-api/types/strslice"
+)
+
+const (
+	// Longest healthcheck probe output message to store. Longer messages will be truncated.
+	maxOutputLen = 4096
+
+	// Default interval between probe runs (from the end of the first to the start of the second).
+	// Also the time before the first probe.
+	defaultProbeInterval = 30 * time.Second
+
+	// The maximum length of time a single probe run should take. If the probe takes longer
+	// than this, the check is considered to have failed.
+	defaultProbeTimeout = 30 * time.Second
+
+	// Shut down a container if it becomes Unhealthy.
+	defaultExitOnUnhealthy = true
+
+	// Maximum number of entries to record
+	maxLogEntries = 5
+)
+
+const (
+	// Exit status codes that can be returned by the probe command.
+
+	exitStatusHealthy   = 0 // Container is healthy
+	exitStatusUnhealthy = 1 // Container is unhealthy
+	exitStatusStarting  = 2 // Container needs more time to start
+)
+
+// probe implementations know how to run a particular type of probe.
+type probe interface {
+	// Perform one run of the check. Returns the exit code and an optional
+	// short diagnostic string.
+	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
+}
+
+// cmdProbe implements the "CMD" probe type.
+type cmdProbe struct {
+	// Run the command with the system's default shell instead of execing it directly.
+	shell bool
+}
+
+// exec the healthcheck command in the container.
+// Returns the exit code and probe output (if any)
+func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {
+	cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:]
+	if p.shell {
+		if runtime.GOOS != "windows" {
+			cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...)
+		} else {
+			cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...)
+		}
+	}
+	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
+	execConfig := exec.NewConfig()
+	execConfig.OpenStdin = false
+	execConfig.OpenStdout = true
+	execConfig.OpenStderr = true
+	execConfig.ContainerID = container.ID
+	execConfig.DetachKeys = []byte{}
+	execConfig.Entrypoint = entrypoint
+	execConfig.Args = args
+	execConfig.Tty = false
+	execConfig.Privileged = false
+	execConfig.User = container.Config.User
+
+	d.registerExecCommand(container, execConfig)
+	d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))
+
+	output := &limitedBuffer{}
+	err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
+	if err != nil {
+		return nil, err
+	}
+	info, err := d.getExecConfig(execConfig.ID)
+	if err != nil {
+		return nil, err
+	}
+	if info.ExitCode == nil {
+		return nil, fmt.Errorf("Healthcheck has no exit code!")
+	}
+	// Note: Go's json package will handle invalid UTF-8 for us
+	out := output.String()
+	return &types.HealthcheckResult{
+		End:      time.Now(),
+		ExitCode: *info.ExitCode,
+		Output:   out,
+	}, nil
+}
+
+// Update the container's Status.Health struct based on the latest probe's result.
+func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) {
+	c.Lock()
+	defer c.Unlock()
+
+	retries := c.Config.Healthcheck.Retries
+	if retries <= 0 {
+		retries = 1 // Default if unset or set to an invalid value
+	}
+
+	h := c.State.Health
+	oldStatus := h.Status
+
+	if len(h.Log) >= maxLogEntries {
+		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
+	} else {
+		h.Log = append(h.Log, result)
+	}
+
+	if result.ExitCode == exitStatusHealthy {
+		h.FailingStreak = 0
+		h.Status = types.Healthy
+	} else if result.ExitCode == exitStatusStarting && c.State.Health.Status == types.Starting {
+		// The container is not ready yet. Remain in the starting state.
+	} else {
+		// Failure (including invalid exit code)
+		h.FailingStreak++
+		if c.State.Health.FailingStreak >= retries {
+			h.Status = types.Unhealthy
+		}
+		// Else we're starting or healthy. Stay in that state.
+	}
+
+	if oldStatus != h.Status {
+		d.LogContainerEvent(c, "health_status: "+h.Status)
+	}
+}
+
+// Run the container's monitoring thread until notified via "stop".
+// There is never more than one monitor thread running per container at a time.
+func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
+	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
+	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
+	for {
+		select {
+		case <-stop:
+			logrus.Debugf("Stop healthcheck monitoring (received while idle)")
+			return
+		case <-time.After(probeInterval):
+			logrus.Debugf("Running health check...")
+			startTime := time.Now()
+			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
+			results := make(chan *types.HealthcheckResult)
+			go func() {
+				result, err := probe.run(ctx, d, c)
+				if err != nil {
+					logrus.Warnf("Health check error: %v", err)
+					results <- &types.HealthcheckResult{
+						ExitCode: -1,
+						Output:   err.Error(),
+						Start:    startTime,
+						End:      time.Now(),
+					}
+				} else {
+					result.Start = startTime
+					logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode)
+					results <- result
+				}
+				close(results)
+			}()
+			select {
+			case <-stop:
+				logrus.Debugf("Stop healthcheck monitoring (received while probing)")
+				// Stop timeout and kill probe, but don't wait for probe to exit.
+				cancelProbe()
+				return
+			case result := <-results:
+				handleProbeResult(d, c, result)
+				// Stop timeout
+				cancelProbe()
+			case <-ctx.Done():
+				logrus.Debugf("Health check taking too long")
+				handleProbeResult(d, c, &types.HealthcheckResult{
+					ExitCode: -1,
+					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
+					Start:    startTime,
+					End:      time.Now(),
+				})
+				cancelProbe()
+				// Wait for probe to exit (it might take a while to respond to the TERM
+				// signal and we don't want dying probes to pile up).
+				<-results
+			}
+		}
+	}
+}
+
+// Get a suitable probe implementation for the container's healthcheck configuration.
+func getProbe(c *container.Container) probe {
+	config := c.Config.Healthcheck
+	if config == nil || len(config.Test) == 0 {
+		return nil
+	}
+	switch config.Test[0] {
+	case "CMD":
+		return &cmdProbe{shell: false}
+	case "CMD-SHELL":
+		return &cmdProbe{shell: true}
+	default:
+		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0])
+		return nil
+	}
+}
+
+// Start or stop the health-check monitor as appropriate for the container's
+// current state.
+// Called from monitor.go, with c locked.
+func (d *Daemon) updateHealthMonitor(c *container.Container) {
+	h := c.State.Health
+	if h == nil {
+		return // No healthcheck configured
+	}
+
+	probe := getProbe(c)
+	wantRunning := c.Running && !c.Paused && probe != nil
+	if wantRunning {
+		if stop := h.OpenMonitorChannel(); stop != nil {
+			go monitor(d, c, stop, probe)
+		}
+	} else {
+		h.CloseMonitorChannel()
+	}
+}
+
+// Reset the health state for a newly-started, restarted or restored container.
+// initHealthMonitor is called from monitor.go and we should never be running
+// two instances at once.
+// Called with c locked.
+func (d *Daemon) initHealthMonitor(c *container.Container) {
+	if c.Config.Healthcheck == nil {
+		return
+	}
+
+	// This is needed in case we're auto-restarting
+	d.stopHealthchecks(c)
+
+	if c.State.Health == nil {
+		h := &container.Health{}
+		h.Status = types.Starting
+		h.FailingStreak = 0
+		c.State.Health = h
+	}
+
+	d.updateHealthMonitor(c)
+}
+
+// Called when the container is being stopped (whether because the health check is
+// failing or for any other reason).
+func (d *Daemon) stopHealthchecks(c *container.Container) {
+	h := c.State.Health
+	if h != nil {
+		h.CloseMonitorChannel()
+	}
+}
+
+// Buffer up to maxOutputLen bytes. Further data is discarded.
+type limitedBuffer struct {
+	buf       bytes.Buffer
+	truncated bool // indicates that data has been lost
+}
+
+// Append to limitedBuffer while there is room.
+func (b *limitedBuffer) Write(data []byte) (int, error) {
+	bufLen := b.buf.Len()
+	dataLen := len(data)
+	keep := min(maxOutputLen-bufLen, dataLen)
+	if keep > 0 {
+		b.buf.Write(data[:keep])
+	}
+	if keep < dataLen {
+		b.truncated = true
+	}
+	return dataLen, nil
+}
+
+// The contents of the buffer, with "..." appended if it overflowed.
+func (b *limitedBuffer) String() string {
+	out := b.buf.String()
+	if b.truncated {
+		out = out + "..."
+	}
+	return out
+}
+
+// If configuredValue is zero, use defaultValue instead.
+func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
+	if configuredValue == 0 {
+		return defaultValue
+	}
+	return configuredValue
+}
+
+func min(x, y int) int {
+	if x < y {
+		return x
+	}
+	return y
+}
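
A quick usage sketch of `limitedBuffer`, restated with `maxOutputLen` shrunk
to 8 bytes so the truncation is visible (the real constant is 4096):

```go
package main

import (
	"bytes"
	"fmt"
)

const maxOutputLen = 8 // shrunk from 4096 for the demo

type limitedBuffer struct {
	buf       bytes.Buffer
	truncated bool
}

// Write keeps only the first maxOutputLen bytes but reports the full length
// so writers don't see short-write errors.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	keep := maxOutputLen - b.buf.Len()
	if keep > len(data) {
		keep = len(data)
	}
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < len(data) {
		b.truncated = true
	}
	return len(data), nil
}

func (b *limitedBuffer) String() string {
	if b.truncated {
		return b.buf.String() + "..."
	}
	return b.buf.String()
}

func main() {
	var b limitedBuffer
	fmt.Fprint(&b, "hello, healthcheck")
	fmt.Println(b.String()) // prints "hello, h..."
}
```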

+ 112 - 0
daemon/health_test.go

@@ -0,0 +1,112 @@
+package daemon
+
+import (
+	"testing"
+	"time"
+
+	"github.com/docker/docker/container"
+	"github.com/docker/docker/daemon/events"
+	"github.com/docker/engine-api/types"
+	containertypes "github.com/docker/engine-api/types/container"
+	eventtypes "github.com/docker/engine-api/types/events"
+)
+
+func reset(c *container.Container) {
+	c.State = &container.State{}
+	c.State.Health = &container.Health{}
+	c.State.Health.Status = types.Starting
+}
+
+func TestHealthStates(t *testing.T) {
+	e := events.New()
+	_, l, _ := e.Subscribe()
+	defer e.Evict(l)
+
+	expect := func(expected string) {
+		select {
+		case event := <-l:
+			ev := event.(eventtypes.Message)
+			if ev.Status != expected {
+				t.Errorf("Expecting event %#v, but got %#v\n", expected, ev.Status)
+			}
+		case <-time.After(1 * time.Second):
+			t.Errorf("Expecting event %#v, but got nothing\n", expected)
+		}
+	}
+
+	c := &container.Container{
+		CommonContainer: container.CommonContainer{
+			ID:   "container_id",
+			Name: "container_name",
+			Config: &containertypes.Config{
+				Image: "image_name",
+			},
+		},
+	}
+	daemon := &Daemon{
+		EventsService: e,
+	}
+
+	c.Config.Healthcheck = &containertypes.HealthConfig{
+		Retries: 1,
+	}
+
+	reset(c)
+
+	handleResult := func(startTime time.Time, exitCode int) {
+		handleProbeResult(daemon, c, &types.HealthcheckResult{
+			Start:    startTime,
+			End:      startTime,
+			ExitCode: exitCode,
+		})
+	}
+
+	// starting -> failed -> success -> failed
+
+	handleResult(c.State.StartedAt.Add(1*time.Second), 1)
+	expect("health_status: unhealthy")
+
+	handleResult(c.State.StartedAt.Add(2*time.Second), 0)
+	expect("health_status: healthy")
+
+	handleResult(c.State.StartedAt.Add(3*time.Second), 1)
+	expect("health_status: unhealthy")
+
+	// starting -> starting -> starting ->
+	// healthy -> starting (invalid transition)
+
+	reset(c)
+
+	handleResult(c.State.StartedAt.Add(20*time.Second), 2)
+	handleResult(c.State.StartedAt.Add(40*time.Second), 2)
+	if c.State.Health.Status != types.Starting {
+		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
+	}
+
+	handleResult(c.State.StartedAt.Add(50*time.Second), 0)
+	expect("health_status: healthy")
+	handleResult(c.State.StartedAt.Add(60*time.Second), 2)
+	expect("health_status: unhealthy")
+
+	// Test retries
+
+	reset(c)
+	c.Config.Healthcheck.Retries = 3
+
+	handleResult(c.State.StartedAt.Add(20*time.Second), 1)
+	handleResult(c.State.StartedAt.Add(40*time.Second), 1)
+	if c.State.Health.Status != types.Starting {
+		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
+	}
+	if c.State.Health.FailingStreak != 2 {
+		t.Errorf("Expecting FailingStreak=2, but got %d\n", c.State.Health.FailingStreak)
+	}
+	handleResult(c.State.StartedAt.Add(60*time.Second), 1)
+	expect("health_status: unhealthy")
+
+	handleResult(c.State.StartedAt.Add(80*time.Second), 0)
+	expect("health_status: healthy")
+	if c.State.Health.FailingStreak != 0 {
+		t.Errorf("Expecting FailingStreak=0, but got %d\n", c.State.Health.FailingStreak)
+	}
+}

+ 10 - 0
daemon/inspect.go

@@ -108,6 +108,15 @@ func (daemon *Daemon) getInspectData(container *container.Container, size bool)
 		hostConfig.Links = append(hostConfig.Links, fmt.Sprintf("%s:%s", child.Name, linkAlias))
 	}
 
+	var containerHealth *types.Health
+	if container.State.Health != nil {
+		containerHealth = &types.Health{
+			Status:        container.State.Health.Status,
+			FailingStreak: container.State.Health.FailingStreak,
+			Log:           append([]*types.HealthcheckResult{}, container.State.Health.Log...),
+		}
+	}
+
 	containerState := &types.ContainerState{
 		Status:     container.State.StateString(),
 		Running:    container.State.Running,
@@ -120,6 +129,7 @@ func (daemon *Daemon) getInspectData(container *container.Container, size bool)
 		Error:      container.State.Error,
 		StartedAt:  container.State.StartedAt.Format(time.RFC3339Nano),
 		FinishedAt: container.State.FinishedAt.Format(time.RFC3339Nano),
+		Health:     containerHealth,
 	}
 
 	contJSONBase := &types.ContainerJSONBase{

+ 9 - 0
daemon/monitor.go

@@ -25,6 +25,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 		if runtime.GOOS == "windows" {
 			return errors.New("Received StateOOM from libcontainerd on Windows. This should never happen.")
 		}
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEvent(c, "oom")
 	case libcontainerd.StateExit:
 		c.Lock()
@@ -35,6 +36,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 		attributes := map[string]string{
 			"exitCode": strconv.Itoa(int(e.ExitCode)),
 		}
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEventWithAttributes(c, "die", attributes)
 		daemon.Cleanup(c)
 		// FIXME: here is race condition between two RUN instructions in Dockerfile
@@ -54,6 +56,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 			"exitCode": strconv.Itoa(int(e.ExitCode)),
 		}
 		daemon.LogContainerEventWithAttributes(c, "die", attributes)
+		daemon.updateHealthMonitor(c)
 		return c.ToDisk()
 	case libcontainerd.StateExitProcess:
 		c.Lock()
@@ -74,18 +77,24 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 			logrus.Warnf("Ignoring StateExitProcess for %v but no exec command found", e)
 		}
 	case libcontainerd.StateStart, libcontainerd.StateRestore:
+		// Container is already locked in this case
 		c.SetRunning(int(e.Pid), e.State == libcontainerd.StateStart)
 		c.HasBeenManuallyStopped = false
 		if err := c.ToDisk(); err != nil {
 			c.Reset(false)
 			return err
 		}
+		daemon.initHealthMonitor(c)
 		daemon.LogContainerEvent(c, "start")
 	case libcontainerd.StatePause:
+		// Container is already locked in this case
 		c.Paused = true
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEvent(c, "pause")
 	case libcontainerd.StateResume:
+		// Container is already locked in this case
 		c.Paused = false
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEvent(c, "unpause")
 	}
 

+ 2 - 0
daemon/stop.go

@@ -41,6 +41,8 @@ func (daemon *Daemon) containerStop(container *container.Container, seconds int)
 		return nil
 	}
 
+	daemon.stopHealthchecks(container)
+
 	stopSignal := container.StopSignal()
 	// 1. Send a stop signal
 	if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {

+ 67 - 0
docs/reference/builder.md

@@ -1470,6 +1470,73 @@ The `STOPSIGNAL` instruction sets the system call signal that will be sent to th
 This signal can be a valid unsigned number that matches a position in the kernel's syscall table, for instance 9,
 or a signal name in the format SIGNAME, for instance SIGKILL.
 
+## HEALTHCHECK
+
+The `HEALTHCHECK` instruction has two forms:
+
+* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container)
+* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image)
+
+The `HEALTHCHECK` instruction tells Docker how to test a container to check that
+it is still working. This can detect cases such as a web server that is stuck in
+an infinite loop and unable to handle new connections, even though the server
+process is still running.
+
+When a container has a healthcheck specified, it has a _health status_ in
+addition to its normal status. This status is initially `starting`. Whenever a
+health check passes, it becomes `healthy` (whatever state it was previously in).
+After a certain number of consecutive failures, it becomes `unhealthy`.
+
+The options that can appear before `CMD` are:
+
+* `--interval=DURATION` (default: `30s`)
+* `--timeout=DURATION` (default: `30s`)
+* `--retries=N` (default: `1`)
+
+The health check will first run **interval** seconds after the container is
+started, and then again **interval** seconds after each previous check completes.
+
+If a single run of the check takes longer than **timeout** seconds then the check
+is considered to have failed.
+
+It takes **retries** consecutive failures of the health check for the container
+to be considered `unhealthy`.
+
+There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list
+more than one then only the last `HEALTHCHECK` will take effect.
+
+The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK
+CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands;
+see e.g. `ENTRYPOINT` for details).
+
+The command's exit status indicates the health status of the container.
+The possible values are:
+
+- 0: success - the container is healthy and ready for use
+- 1: unhealthy - the container is not working correctly
+- 2: starting - the container is not ready for use yet, but is working correctly
+
+If the probe returns 2 ("starting") when the container has already moved out of the
+"starting" state then it is treated as "unhealthy" instead.
+
+For example, to check every five minutes or so that a web-server is able to
+serve the site's main page within three seconds:
+
+    HEALTHCHECK --interval=5m --timeout=3s \
+      CMD curl -f http://localhost/ || exit 1
+
+To help debug failing probes, any output text (UTF-8 encoded) that the command writes
+on stdout or stderr will be stored in the health status and can be queried with
+`docker inspect`. Such output should be kept short (only the first 4096 bytes
+are stored currently).
+
+When the health status of a container changes, a `health_status` event is
+generated with the new status.
+
+The `HEALTHCHECK` feature was added in Docker 1.12.
+
+
+
 ## Dockerfile examples
 
 Below you can see some examples of Dockerfile syntax. If you're interested in

+ 60 - 0
docs/reference/run.md

@@ -1250,6 +1250,7 @@ Dockerfile instruction and how the operator can override that setting.
     #entrypoint-default-command-to-execute-at-runtime)
  - [EXPOSE (Incoming Ports)](#expose-incoming-ports)
  - [ENV (Environment Variables)](#env-environment-variables)
+ - [HEALTHCHECK](#healthcheck)
  - [VOLUME (Shared Filesystems)](#volume-shared-filesystems)
  - [USER](#user)
  - [WORKDIR](#workdir)
@@ -1398,6 +1399,65 @@ above, or already defined by the developer with a Dockerfile `ENV`:
 
 Similarly the operator can set the **hostname** with `-h`.
 
+### HEALTHCHECK
+
+```
+  --health-cmd            Command to run to check health
+  --health-interval       Time between running the check
+  --health-retries        Consecutive failures needed to report unhealthy
+  --health-timeout        Maximum time to allow one check to run
+  --no-healthcheck        Disable any container-specified HEALTHCHECK
+```
+
+Example:
+
+    $ docker run --name=test -d \
+        --health-cmd='stat /etc/passwd || exit 1' \
+        --health-interval=2s \
+        busybox sleep 1d
+    $ sleep 2; docker inspect --format='{{.State.Health.Status}}' test
+    healthy
+    $ docker exec test rm /etc/passwd
+    $ sleep 2; docker inspect --format='{{json .State.Health}}' test
+    {
+      "Status": "unhealthy",
+      "FailingStreak": 3,
+      "Log": [
+        {
+          "Start": "2016-05-25T17:22:04.635478668Z",
+          "End": "2016-05-25T17:22:04.7272552Z",
+          "ExitCode": 0,
+          "Output": "  File: /etc/passwd\n  Size: 334       \tBlocks: 8          IO Block: 4096   regular file\nDevice: 32h/50d\tInode: 12          Links: 1\nAccess: (0664/-rw-rw-r--)  Uid: (    0/    root)   Gid: (    0/    root)\nAccess: 2015-12-05 22:05:32.000000000\nModify: 2015..."
+        },
+        {
+          "Start": "2016-05-25T17:22:06.732900633Z",
+          "End": "2016-05-25T17:22:06.822168935Z",
+          "ExitCode": 0,
+          "Output": "  File: /etc/passwd\n  Size: 334       \tBlocks: 8          IO Block: 4096   regular file\nDevice: 32h/50d\tInode: 12          Links: 1\nAccess: (0664/-rw-rw-r--)  Uid: (    0/    root)   Gid: (    0/    root)\nAccess: 2015-12-05 22:05:32.000000000\nModify: 2015..."
+        },
+        {
+          "Start": "2016-05-25T17:22:08.823956535Z",
+          "End": "2016-05-25T17:22:08.897359124Z",
+          "ExitCode": 1,
+          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
+        },
+        {
+          "Start": "2016-05-25T17:22:10.898802931Z",
+          "End": "2016-05-25T17:22:10.969631866Z",
+          "ExitCode": 1,
+          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
+        },
+        {
+          "Start": "2016-05-25T17:22:12.971033523Z",
+          "End": "2016-05-25T17:22:13.082015516Z",
+          "ExitCode": 1,
+          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
+        }
+      ]
+    }
+
+The health status is also displayed in the `docker ps` output.
+
 ### TMPFS (mount tmpfs filesystems)
 
 ```bash

+ 154 - 0
integration-cli/docker_cli_health_test.go

@@ -0,0 +1,154 @@
+package main
+
+import (
+	"encoding/json"
+	"github.com/docker/docker/pkg/integration/checker"
+	"github.com/docker/engine-api/types"
+	"github.com/go-check/check"
+	"strconv"
+	"strings"
+	"time"
+)
+
+func waitForStatus(c *check.C, name string, prev string, expected string) {
+	prev = prev + "\n"
+	expected = expected + "\n"
+	for {
+		out, _ := dockerCmd(c, "inspect", "--format={{.State.Status}}", name)
+		if out == expected {
+			return
+		}
+		c.Check(out, checker.Equals, prev)
+		if out != prev {
+			return
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+}
+
+func waitForHealthStatus(c *check.C, name string, prev string, expected string) {
+	prev = prev + "\n"
+	expected = expected + "\n"
+	for {
+		out, _ := dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name)
+		if out == expected {
+			return
+		}
+		c.Check(out, checker.Equals, prev)
+		if out != prev {
+			return
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+}
+
+func getHealth(c *check.C, name string) *types.Health {
+	out, _ := dockerCmd(c, "inspect", "--format={{json .State.Health}}", name)
+	var health types.Health
+	err := json.Unmarshal([]byte(out), &health)
+	c.Check(err, checker.Equals, nil)
+	return &health
+}
+
+func (s *DockerSuite) TestHealth(c *check.C) {
+	testRequires(c, DaemonIsLinux) // busybox doesn't work on Windows
+
+	imageName := "testhealth"
+	_, err := buildImage(imageName,
+		`FROM busybox
+		RUN echo OK > /status
+		CMD ["/bin/sleep", "120"]
+		STOPSIGNAL SIGKILL
+		HEALTHCHECK --interval=1s --timeout=30s \
+		  CMD cat /status`,
+		true)
+
+	c.Check(err, check.IsNil)
+
+	// No health status before starting
+	name := "test_health"
+	dockerCmd(c, "create", "--name", name, imageName)
+	out, _ := dockerCmd(c, "ps", "-a", "--format={{.Status}}")
+	c.Check(out, checker.Equals, "Created\n")
+
+	// Inspect the options
+	out, _ = dockerCmd(c, "inspect",
+		"--format='timeout={{.Config.Healthcheck.Timeout}} "+
+			"interval={{.Config.Healthcheck.Interval}} "+
+			"retries={{.Config.Healthcheck.Retries}} "+
+			"test={{.Config.Healthcheck.Test}}'", name)
+	c.Check(out, checker.Equals, "timeout=30s interval=1s retries=0 test=[CMD-SHELL cat /status]\n")
+
+	// Start
+	dockerCmd(c, "start", name)
+	waitForHealthStatus(c, name, "starting", "healthy")
+
+	// Make it fail
+	dockerCmd(c, "exec", name, "rm", "/status")
+	waitForHealthStatus(c, name, "healthy", "unhealthy")
+
+	// Inspect the status
+	out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name)
+	c.Check(out, checker.Equals, "unhealthy\n")
+
+	// Make it healthy again
+	dockerCmd(c, "exec", name, "touch", "/status")
+	waitForHealthStatus(c, name, "unhealthy", "healthy")
+
+	// Remove container
+	dockerCmd(c, "rm", "-f", name)
+
+	// Disable the check from the CLI
+	out, _ = dockerCmd(c, "create", "--name=noh", "--no-healthcheck", imageName)
+	out, _ = dockerCmd(c, "inspect", "--format={{.Config.Healthcheck.Test}}", "noh")
+	c.Check(out, checker.Equals, "[NONE]\n")
+	dockerCmd(c, "rm", "noh")
+
+	// Disable the check with a new build
+	_, err = buildImage("no_healthcheck",
+		`FROM testhealth
+		HEALTHCHECK NONE`, true)
+	c.Check(err, check.IsNil)
+
+	out, _ = dockerCmd(c, "inspect", "--format={{.ContainerConfig.Healthcheck.Test}}", "no_healthcheck")
+	c.Check(out, checker.Equals, "[NONE]\n")
+
+	// Enable the checks from the CLI
+	_, _ = dockerCmd(c, "run", "-d", "--name=fatal_healthcheck",
+		"--health-interval=0.5s",
+		"--health-retries=3",
+		"--health-cmd=cat /status",
+		"no_healthcheck")
+	waitForHealthStatus(c, "fatal_healthcheck", "starting", "healthy")
+	health := getHealth(c, "fatal_healthcheck")
+	c.Check(health.Status, checker.Equals, "healthy")
+	c.Check(health.FailingStreak, checker.Equals, 0)
+	last := health.Log[len(health.Log)-1]
+	c.Check(last.ExitCode, checker.Equals, 0)
+	c.Check(last.Output, checker.Equals, "OK\n")
+
+	// Fail the check, which should now make it exit
+	dockerCmd(c, "exec", "fatal_healthcheck", "rm", "/status")
+	waitForStatus(c, "fatal_healthcheck", "running", "exited")
+
+	out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", "fatal_healthcheck")
+	c.Check(out, checker.Equals, "unhealthy\n")
+	failsStr, _ := dockerCmd(c, "inspect", "--format={{.State.Health.FailingStreak}}", "fatal_healthcheck")
+	fails, err := strconv.Atoi(strings.TrimSpace(failsStr))
+	c.Check(err, check.IsNil)
+	c.Check(fails >= 3, checker.Equals, true)
+	dockerCmd(c, "rm", "-f", "fatal_healthcheck")
+
+	// Check timeout
+	// Note: if the interval is too small, it seems that Docker spends all its time running health
+	// checks and never gets around to killing it.
+	_, _ = dockerCmd(c, "run", "-d", "--name=test",
+		"--health-interval=1s", "--health-cmd=sleep 5m", "--health-timeout=1ms", imageName)
+	waitForHealthStatus(c, "test", "starting", "unhealthy")
+	health = getHealth(c, "test")
+	last = health.Log[len(health.Log)-1]
+	c.Check(health.Status, checker.Equals, "unhealthy")
+	c.Check(last.ExitCode, checker.Equals, -1)
+	c.Check(last.Output, checker.Equals, "Health check exceeded timeout (1ms)")
+	dockerCmd(c, "rm", "-f", "test")
+}

+ 11 - 0
libcontainerd/client_linux.go

@@ -190,6 +190,17 @@ func (clnt *client) Signal(containerID string, sig int) error {
 	return err
 }
 
+func (clnt *client) SignalProcess(containerID string, pid string, sig int) error {
+	clnt.lock(containerID)
+	defer clnt.unlock(containerID)
+	_, err := clnt.remote.apiClient.Signal(context.Background(), &containerd.SignalRequest{
+		Id:     containerID,
+		Pid:    pid,
+		Signal: uint32(sig),
+	})
+	return err
+}
+
 func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error {
 	clnt.lock(containerID)
 	defer clnt.unlock(containerID)

+ 19 - 0
libcontainerd/client_windows.go

@@ -304,6 +304,25 @@ func (clnt *client) Signal(containerID string, sig int) error {
 	return nil
 }
 
+// While Linux has support for the full range of signals, signals aren't really implemented on Windows.
+// We try to terminate the specified process whatever signal is requested.
+func (clnt *client) SignalProcess(containerID string, processFriendlyName string, sig int) error {
+	clnt.lock(containerID)
+	defer clnt.unlock(containerID)
+	cont, err := clnt.getContainer(containerID)
+	if err != nil {
+		return err
+	}
+
+	for _, p := range cont.processes {
+		if p.friendlyName == processFriendlyName {
+			return hcsshim.TerminateProcessInComputeSystem(containerID, p.systemPid)
+		}
+	}
+
+	return fmt.Errorf("SignalProcess could not find process %s in %s", processFriendlyName, containerID)
+}
+
 // Resize handles a CLI event to resize an interactive docker run or docker exec
 // window.
 func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error {

+ 1 - 0
libcontainerd/types.go

@@ -34,6 +34,7 @@ type Backend interface {
 type Client interface {
 	Create(containerID string, spec Spec, options ...CreateOption) error
 	Signal(containerID string, sig int) error
+	SignalProcess(containerID string, processFriendlyName string, sig int) error
 	AddProcess(containerID, processFriendlyName string, process Process) error
 	Resize(containerID, processFriendlyName string, width, height int) error
 	Pause(containerID string) error

+ 40 - 0
runconfig/opts/parse.go

@@ -100,6 +100,12 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		flStopSignal        = cmd.String([]string{"-stop-signal"}, signal.DefaultStopSignal, fmt.Sprintf("Signal to stop a container, %v by default", signal.DefaultStopSignal))
 		flIsolation         = cmd.String([]string{"-isolation"}, "", "Container isolation technology")
 		flShmSize           = cmd.String([]string{"-shm-size"}, "", "Size of /dev/shm, default value is 64MB")
+		// Healthcheck
+		flNoHealthcheck  = cmd.Bool([]string{"-no-healthcheck"}, false, "Disable any container-specified HEALTHCHECK")
+		flHealthCmd      = cmd.String([]string{"-health-cmd"}, "", "Command to run to check health")
+		flHealthInterval = cmd.Duration([]string{"-health-interval"}, 0, "Time between running the check")
+		flHealthTimeout  = cmd.Duration([]string{"-health-timeout"}, 0, "Maximum time to allow one check to run")
+		flHealthRetries  = cmd.Int([]string{"-health-retries"}, 0, "Consecutive failures needed to report unhealthy")
 	)
 
 	cmd.Var(&flAttach, []string{"a", "-attach"}, "Attach to STDIN, STDOUT or STDERR")
@@ -351,6 +357,39 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		return nil, nil, nil, cmd, err
 	}
 
+	// Healthcheck
+	var healthConfig *container.HealthConfig
+	haveHealthSettings := *flHealthCmd != "" ||
+		*flHealthInterval != 0 ||
+		*flHealthTimeout != 0 ||
+		*flHealthRetries != 0
+	if *flNoHealthcheck {
+		if haveHealthSettings {
+			return nil, nil, nil, cmd, fmt.Errorf("--no-healthcheck conflicts with --health-* options")
+		}
+		test := strslice.StrSlice{"NONE"}
+		healthConfig = &container.HealthConfig{Test: test}
+	} else if haveHealthSettings {
+		var probe strslice.StrSlice
+		if *flHealthCmd != "" {
+			args := []string{"CMD-SHELL", *flHealthCmd}
+			probe = strslice.StrSlice(args)
+		}
+		if *flHealthInterval < 0 {
+			return nil, nil, nil, cmd, fmt.Errorf("--health-interval cannot be negative")
+		}
+		if *flHealthTimeout < 0 {
+			return nil, nil, nil, cmd, fmt.Errorf("--health-timeout cannot be negative")
+		}
+
+		healthConfig = &container.HealthConfig{
+			Test:     probe,
+			Interval: *flHealthInterval,
+			Timeout:  *flHealthTimeout,
+			Retries:  *flHealthRetries,
+		}
+	}
+
 	resources := container.Resources{
 		CgroupParent:         *flCgroupParent,
 		Memory:               flMemory,
@@ -399,6 +438,7 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		Entrypoint:      entrypoint,
 		WorkingDir:      *flWorkingDir,
 		Labels:          ConvertKVStringsToMap(labels),
+		Healthcheck:     healthConfig,
 	}
 	if cmd.IsSet("-stop-signal") {
 		config.StopSignal = *flStopSignal
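
For reference, the flags above end up in the same `container.HealthConfig`
that the Dockerfile `HEALTHCHECK` instruction produces. A hedged sketch of the
equivalent struct (field names as used in this diff; the `engine-api` import
paths are the ones current at the time of this PR):

```go
package main

import (
	"fmt"
	"time"

	"github.com/docker/engine-api/types/container"
	"github.com/docker/engine-api/types/strslice"
)

func main() {
	// Roughly what `docker run --health-cmd='stat /etc/passwd' \
	//   --health-interval=2s --health-retries=3 ...` produces.
	hc := &container.HealthConfig{
		Test:     strslice.StrSlice{"CMD-SHELL", "stat /etc/passwd"},
		Interval: 2 * time.Second,
		Retries:  3,
	}
	fmt.Printf("%+v\n", hc)
}
```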

+ 40 - 0
runconfig/opts/parse_test.go

@@ -9,6 +9,7 @@ import (
 	"runtime"
 	"strings"
 	"testing"
+	"time"
 
 	flag "github.com/docker/docker/pkg/mflag"
 	"github.com/docker/docker/runconfig"
@@ -584,6 +585,45 @@ func TestParseRestartPolicy(t *testing.T) {
 	}
 }
 
+func TestParseHealth(t *testing.T) {
+	checkOk := func(args ...string) *container.HealthConfig {
+		config, _, _, _, err := parseRun(args)
+		if err != nil {
+			t.Fatalf("%#v: %v", args, err)
+		}
+		return config.Healthcheck
+	}
+	checkError := func(expected string, args ...string) {
+		config, _, _, _, err := parseRun(args)
+		if err == nil {
+			t.Fatalf("Expected error, but got %#v", config)
+		}
+		if err.Error() != expected {
+			t.Fatalf("Expected %#v, got %#v", expected, err)
+		}
+	}
+	health := checkOk("--no-healthcheck", "img", "cmd")
+	if health == nil || len(health.Test) != 1 || health.Test[0] != "NONE" {
+		t.Fatalf("--no-healthcheck failed: %#v", health)
+	}
+
+	health = checkOk("--health-cmd=/check.sh -q", "img", "cmd")
+	if len(health.Test) != 2 || health.Test[0] != "CMD-SHELL" || health.Test[1] != "/check.sh -q" {
+		t.Fatalf("--health-cmd: got %#v", health.Test)
+	}
+	if health.Timeout != 0 {
+		t.Fatalf("--health-cmd: timeout = %f", health.Timeout)
+	}
+
+	checkError("--no-healthcheck conflicts with --health-* options",
+		"--no-healthcheck", "--health-cmd=/check.sh -q", "img", "cmd")
+
+	health = checkOk("--health-timeout=2s", "--health-retries=3", "--health-interval=4.5s", "img", "cmd")
+	if health.Timeout != 2*time.Second || health.Retries != 3 || health.Interval != 4500*time.Millisecond {
+		t.Fatalf("--health-*: got %#v", health)
+	}
+}
+
 func TestParseLoggingOpts(t *testing.T) {
 	// logging opts ko
 	if _, _, _, _, err := parseRun([]string{"--log-driver=none", "--log-opt=anything", "img", "cmd"}); err == nil || err.Error() != "invalid logging opts for driver none" {