
Add support for user-defined healthchecks

This PR adds support for user-defined health-check probes for Docker
containers. It adds a `HEALTHCHECK` instruction to the Dockerfile syntax plus
some corresponding "docker run" options. It can be used with a restart policy
to automatically restart a container if the check fails.
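
As a sketch of that combination, using the `docker run` health flags added
by this PR (`my-image` and `/bin/check-running` are placeholder names):

    # If the check keeps failing, the container is marked unhealthy and
    # stopped, and the restart policy then brings it back up.
    docker run -d --restart=on-failure \
        --health-cmd='/bin/check-running' \
        --health-interval=30s --health-retries=3 \
        my-image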

The `HEALTHCHECK` instruction has two forms:

* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container)
* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image)

The `HEALTHCHECK` instruction tells Docker how to test a container to check that
it is still working. This can detect cases such as a web server that is stuck in
an infinite loop and unable to handle new connections, even though the server
process is still running.

When a container has a healthcheck specified, it has a _health status_ in
addition to its normal status. This status is initially `starting`. Whenever a
health check passes, it becomes `healthy` (whatever state it was previously in).
After a certain number of consecutive failures, it becomes `unhealthy`.
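
The current value can be read back at any time with `docker inspect`, using
the same Go template that appears in the tests below (`my-container` is a
placeholder name):

    docker inspect --format='{{.State.Health.Status}}' my-container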

The options that can appear before `CMD` are:

* `--interval=DURATION` (default: `30s`)
* `--timeout=DURATION` (default: `30s`)
* `--retries=N` (default: `1`)

The health check will first run **interval** seconds after the container is
started, and then again **interval** seconds after each previous check completes.

If a single run of the check takes longer than **timeout** seconds then the check
is considered to have failed.

It takes **retries** consecutive failures of the health check for the container
to be considered `unhealthy`.
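
The same three settings can also be given when the container is started,
via the new `docker run` flags added by this PR (values here are just the
defaults, shown for illustration; `my-image` is a placeholder):

    docker run -d \
        --health-cmd='/bin/check-running' \
        --health-interval=30s \
        --health-timeout=30s \
        --health-retries=1 \
        my-image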

There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list
more than one then only the last `HEALTHCHECK` will take effect.

The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK
CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands;
see e.g. `ENTRYPOINT` for details).

The command's exit status indicates the health status of the container.
The possible values are:

- 0: success - the container is healthy and ready for use
- 1: unhealthy - the container is not working correctly
- 2: starting - the container is not ready for use yet, but is working correctly

If the probe returns 2 ("starting") when the container has already moved out of the
"starting" state then it is treated as "unhealthy" instead.

For example, to check every five minutes or so that a web server is able to
serve the site's main page within three seconds:

    HEALTHCHECK --interval=5m --timeout=3s \
      CMD curl -f http://localhost/ || exit 1

To help debug failing probes, any output text (UTF-8 encoded) that the command writes
on stdout or stderr will be stored in the health status and can be queried with
`docker inspect`. Such output should be kept short (only the first 4096 bytes
are stored currently).
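
The stored results, including the probe output, can be dumped as JSON using
the same template that appears in the `docs/reference/run.md` example below:

    docker inspect --format='{{json .State.Health}}' my-container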

When the health status of a container changes, a `health_status` event is
generated with the new status. The health status is also displayed in the
`docker ps` output.
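
The new events appear in the normal event stream; one simple way to watch
for them is to filter client-side:

    docker events | grep health_status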

Signed-off-by: Thomas Leonard <thomas.leonard@docker.com>
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
Thomas Leonard, 9 years ago
commit b6c7becbfe

+ 1 - 1
api/server/router/container/backend.go

@@ -17,7 +17,7 @@ type execBackend interface {
 	ContainerExecCreate(name string, config *types.ExecConfig) (string, error)
 	ContainerExecInspect(id string) (*backend.ExecInspect, error)
 	ContainerExecResize(name string, height, width int) error
-	ContainerExecStart(name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) error
+	ContainerExecStart(ctx context.Context, name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) error
 	ExecExists(name string) (bool, error)
 }
 

+ 2 - 1
api/server/router/container/exec.go

@@ -106,7 +106,8 @@ func (s *containerRouter) postContainerExecStart(ctx context.Context, w http.Res
 	}
 
 	// Now run the user process in container.
-	if err := s.backend.ContainerExecStart(execName, stdin, stdout, stderr); err != nil {
+	// Maybe we should pass ctx here if we're not detaching?
+	if err := s.backend.ContainerExecStart(context.Background(), execName, stdin, stdout, stderr); err != nil {
 		if execStartCheck.Detach {
 			return err
 		}

+ 10 - 9
builder/dockerfile/builder.go

@@ -22,15 +22,16 @@ import (
 )
 
 var validCommitCommands = map[string]bool{
-	"cmd":        true,
-	"entrypoint": true,
-	"env":        true,
-	"expose":     true,
-	"label":      true,
-	"onbuild":    true,
-	"user":       true,
-	"volume":     true,
-	"workdir":    true,
+	"cmd":         true,
+	"entrypoint":  true,
+	"healthcheck": true,
+	"env":         true,
+	"expose":      true,
+	"label":       true,
+	"onbuild":     true,
+	"user":        true,
+	"volume":      true,
+	"workdir":     true,
 }
 
 // BuiltinAllowedBuildArgs is list of built-in allowed build args

+ 34 - 32
builder/dockerfile/command/command.go

@@ -3,40 +3,42 @@ package command
 
 // Define constants for the command strings
 const (
-	Env        = "env"
-	Label      = "label"
-	Maintainer = "maintainer"
-	Add        = "add"
-	Copy       = "copy"
-	From       = "from"
-	Onbuild    = "onbuild"
-	Workdir    = "workdir"
-	Run        = "run"
-	Cmd        = "cmd"
-	Entrypoint = "entrypoint"
-	Expose     = "expose"
-	Volume     = "volume"
-	User       = "user"
-	StopSignal = "stopsignal"
-	Arg        = "arg"
+	Env         = "env"
+	Label       = "label"
+	Maintainer  = "maintainer"
+	Add         = "add"
+	Copy        = "copy"
+	From        = "from"
+	Onbuild     = "onbuild"
+	Workdir     = "workdir"
+	Run         = "run"
+	Cmd         = "cmd"
+	Entrypoint  = "entrypoint"
+	Expose      = "expose"
+	Volume      = "volume"
+	User        = "user"
+	StopSignal  = "stopsignal"
+	Arg         = "arg"
+	Healthcheck = "healthcheck"
 )
 
 // Commands is list of all Dockerfile commands
 var Commands = map[string]struct{}{
-	Env:        {},
-	Label:      {},
-	Maintainer: {},
-	Add:        {},
-	Copy:       {},
-	From:       {},
-	Onbuild:    {},
-	Workdir:    {},
-	Run:        {},
-	Cmd:        {},
-	Entrypoint: {},
-	Expose:     {},
-	Volume:     {},
-	User:       {},
-	StopSignal: {},
-	Arg:        {},
+	Env:         {},
+	Label:       {},
+	Maintainer:  {},
+	Add:         {},
+	Copy:        {},
+	From:        {},
+	Onbuild:     {},
+	Workdir:     {},
+	Run:         {},
+	Cmd:         {},
+	Entrypoint:  {},
+	Expose:      {},
+	Volume:      {},
+	User:        {},
+	StopSignal:  {},
+	Arg:         {},
+	Healthcheck: {},
 }

+ 107 - 0
builder/dockerfile/dispatchers.go

@@ -12,7 +12,9 @@ import (
 	"regexp"
 	"regexp"
 	"runtime"
 	"runtime"
 	"sort"
 	"sort"
+	"strconv"
 	"strings"
 	"strings"
+	"time"
 
 
 	"github.com/Sirupsen/logrus"
 	"github.com/Sirupsen/logrus"
 	"github.com/docker/docker/api"
 	"github.com/docker/docker/api"
@@ -426,6 +428,111 @@ func cmd(b *Builder, args []string, attributes map[string]bool, original string)
 	return nil
 }
 
+// parseOptInterval(flag) is the duration of flag.Value, or 0 if
+// empty. An error is reported if the value is given and is not positive.
+func parseOptInterval(f *Flag) (time.Duration, error) {
+	s := f.Value
+	if s == "" {
+		return 0, nil
+	}
+	d, err := time.ParseDuration(s)
+	if err != nil {
+		return 0, err
+	}
+	if d <= 0 {
+		return 0, fmt.Errorf("Interval %#v must be positive", f.name)
+	}
+	return d, nil
+}
+
+// HEALTHCHECK foo
+//
+// Set the default healthcheck command to run in the container (which may be empty).
+// Argument handling is the same as RUN.
+//
+func healthcheck(b *Builder, args []string, attributes map[string]bool, original string) error {
+	if len(args) == 0 {
+		return fmt.Errorf("HEALTHCHECK requires an argument")
+	}
+	typ := strings.ToUpper(args[0])
+	args = args[1:]
+	if typ == "NONE" {
+		if len(args) != 0 {
+			return fmt.Errorf("HEALTHCHECK NONE takes no arguments")
+		}
+		test := strslice.StrSlice{typ}
+		b.runConfig.Healthcheck = &container.HealthConfig{
+			Test: test,
+		}
+	} else {
+		if b.runConfig.Healthcheck != nil {
+			oldCmd := b.runConfig.Healthcheck.Test
+			if len(oldCmd) > 0 && oldCmd[0] != "NONE" {
+				fmt.Fprintf(b.Stdout, "Note: overriding previous HEALTHCHECK: %v\n", oldCmd)
+			}
+		}
+
+		healthcheck := container.HealthConfig{}
+
+		flInterval := b.flags.AddString("interval", "")
+		flTimeout := b.flags.AddString("timeout", "")
+		flRetries := b.flags.AddString("retries", "")
+
+		if err := b.flags.Parse(); err != nil {
+			return err
+		}
+
+		switch typ {
+		case "CMD":
+			cmdSlice := handleJSONArgs(args, attributes)
+			if len(cmdSlice) == 0 {
+				return fmt.Errorf("Missing command after HEALTHCHECK CMD")
+			}
+
+			if !attributes["json"] {
+				typ = "CMD-SHELL"
+			}
+
+			healthcheck.Test = strslice.StrSlice(append([]string{typ}, cmdSlice...))
+		default:
+			return fmt.Errorf("Unknown type %#v in HEALTHCHECK (try CMD)", typ)
+		}
+
+		interval, err := parseOptInterval(flInterval)
+		if err != nil {
+			return err
+		}
+		healthcheck.Interval = interval
+
+		timeout, err := parseOptInterval(flTimeout)
+		if err != nil {
+			return err
+		}
+		healthcheck.Timeout = timeout
+
+		if flRetries.Value != "" {
+			retries, err := strconv.ParseInt(flRetries.Value, 10, 32)
+			if err != nil {
+				return err
+			}
+			if retries < 1 {
+				return fmt.Errorf("--retries must be at least 1 (not %d)", retries)
+			}
+			healthcheck.Retries = int(retries)
+		} else {
+			healthcheck.Retries = 0
+		}
+
+		b.runConfig.Healthcheck = &healthcheck
+	}
+
+	if err := b.commit("", b.runConfig.Cmd, fmt.Sprintf("HEALTHCHECK %q", b.runConfig.Healthcheck)); err != nil {
+		return err
+	}
+
+	return nil
+}
+
 // ENTRYPOINT /usr/sbin/nginx
 //
 // Set the entrypoint (which defaults to sh -c on linux, or cmd /S /C on Windows) to

+ 17 - 16
builder/dockerfile/evaluator.go

@@ -58,22 +58,23 @@ var evaluateTable map[string]func(*Builder, []string, map[string]bool, string) e
 
 func init() {
 	evaluateTable = map[string]func(*Builder, []string, map[string]bool, string) error{
-		command.Env:        env,
-		command.Label:      label,
-		command.Maintainer: maintainer,
-		command.Add:        add,
-		command.Copy:       dispatchCopy, // copy() is a go builtin
-		command.From:       from,
-		command.Onbuild:    onbuild,
-		command.Workdir:    workdir,
-		command.Run:        run,
-		command.Cmd:        cmd,
-		command.Entrypoint: entrypoint,
-		command.Expose:     expose,
-		command.Volume:     volume,
-		command.User:       user,
-		command.StopSignal: stopSignal,
-		command.Arg:        arg,
+		command.Env:         env,
+		command.Label:       label,
+		command.Maintainer:  maintainer,
+		command.Add:         add,
+		command.Copy:        dispatchCopy, // copy() is a go builtin
+		command.From:        from,
+		command.Onbuild:     onbuild,
+		command.Workdir:     workdir,
+		command.Run:         run,
+		command.Cmd:         cmd,
+		command.Entrypoint:  entrypoint,
+		command.Expose:      expose,
+		command.Volume:      volume,
+		command.User:        user,
+		command.StopSignal:  stopSignal,
+		command.Arg:         arg,
+		command.Healthcheck: healthcheck,
 	}
 }
 

+ 29 - 0
builder/dockerfile/parser/line_parsers.go

@@ -329,3 +329,32 @@ func parseMaybeJSONToList(rest string) (*Node, map[string]bool, error) {
 
 	return parseStringsWhitespaceDelimited(rest)
 }
+
+// The HEALTHCHECK command is like parseMaybeJSON, but has an extra type argument.
+func parseHealthConfig(rest string) (*Node, map[string]bool, error) {
+	// Find end of first argument
+	var sep int
+	for ; sep < len(rest); sep++ {
+		if unicode.IsSpace(rune(rest[sep])) {
+			break
+		}
+	}
+	next := sep
+	for ; next < len(rest); next++ {
+		if !unicode.IsSpace(rune(rest[next])) {
+			break
+		}
+	}
+
+	if sep == 0 {
+		return nil, nil, nil
+	}
+
+	typ := rest[:sep]
+	cmd, attrs, err := parseMaybeJSON(rest[next:])
+	if err != nil {
+		return nil, nil, err
+	}
+
+	return &Node{Value: typ, Next: cmd, Attributes: attrs}, nil, err
+}

+ 17 - 16
builder/dockerfile/parser/parser.go

@@ -66,22 +66,23 @@ func init() {
 	// functions. Errors are propagated up by Parse() and the resulting AST can
 	// be incorporated directly into the existing AST as a next.
 	dispatch = map[string]func(string) (*Node, map[string]bool, error){
-		command.User:       parseString,
-		command.Onbuild:    parseSubCommand,
-		command.Workdir:    parseString,
-		command.Env:        parseEnv,
-		command.Label:      parseLabel,
-		command.Maintainer: parseString,
-		command.From:       parseString,
-		command.Add:        parseMaybeJSONToList,
-		command.Copy:       parseMaybeJSONToList,
-		command.Run:        parseMaybeJSON,
-		command.Cmd:        parseMaybeJSON,
-		command.Entrypoint: parseMaybeJSON,
-		command.Expose:     parseStringsWhitespaceDelimited,
-		command.Volume:     parseMaybeJSONToList,
-		command.StopSignal: parseString,
-		command.Arg:        parseNameOrNameVal,
+		command.User:        parseString,
+		command.Onbuild:     parseSubCommand,
+		command.Workdir:     parseString,
+		command.Env:         parseEnv,
+		command.Label:       parseLabel,
+		command.Maintainer:  parseString,
+		command.From:        parseString,
+		command.Add:         parseMaybeJSONToList,
+		command.Copy:        parseMaybeJSONToList,
+		command.Run:         parseMaybeJSON,
+		command.Cmd:         parseMaybeJSON,
+		command.Entrypoint:  parseMaybeJSON,
+		command.Expose:      parseStringsWhitespaceDelimited,
+		command.Volume:      parseMaybeJSONToList,
+		command.StopSignal:  parseString,
+		command.Arg:         parseNameOrNameVal,
+		command.Healthcheck: parseHealthConfig,
 	}
 }
 

+ 10 - 0
builder/dockerfile/parser/testfiles/health/Dockerfile

@@ -0,0 +1,10 @@
+FROM debian
+ADD check.sh main.sh /app/
+CMD /app/main.sh
+HEALTHCHECK
+HEALTHCHECK --interval=5s --timeout=3s --retries=1 \
+  CMD /app/check.sh --quiet
+HEALTHCHECK CMD
+HEALTHCHECK   CMD   a b
+HEALTHCHECK --timeout=3s CMD ["foo"]
+HEALTHCHECK CONNECT TCP 7000

+ 9 - 0
builder/dockerfile/parser/testfiles/health/result

@@ -0,0 +1,9 @@
+(from "debian")
+(add "check.sh" "main.sh" "/app/")
+(cmd "/app/main.sh")
+(healthcheck)
+(healthcheck ["--interval=5s" "--timeout=3s" "--retries=1"] "CMD" "/app/check.sh --quiet")
+(healthcheck "CMD")
+(healthcheck "CMD" "a b")
+(healthcheck ["--timeout=3s"] "CMD" "foo")
+(healthcheck "CONNECT" "TCP 7000")

+ 49 - 0
container/health.go

@@ -0,0 +1,49 @@
+package container
+
+import (
+	"github.com/Sirupsen/logrus"
+	"github.com/docker/engine-api/types"
+)
+
+// Health holds the current container health-check state
+type Health struct {
+	types.Health
+	stop chan struct{} // Write struct{} to stop the monitor
+}
+
+// String returns a human-readable description of the health-check state
+func (s *Health) String() string {
+	if s.stop == nil {
+		return "no healthcheck"
+	}
+	switch s.Status {
+	case types.Starting:
+		return "health: starting"
+	default: // Healthy and Unhealthy are clear on their own
+		return s.Status
+	}
+}
+
+// OpenMonitorChannel creates and returns a new monitor channel. If there already is one,
+// it returns nil.
+func (s *Health) OpenMonitorChannel() chan struct{} {
+	if s.stop == nil {
+		logrus.Debugf("OpenMonitorChannel")
+		s.stop = make(chan struct{})
+		return s.stop
+	}
+	return nil
+}
+
+// CloseMonitorChannel closes any existing monitor channel.
+func (s *Health) CloseMonitorChannel() {
+	if s.stop != nil {
+		logrus.Debugf("CloseMonitorChannel: waiting for probe to stop")
+		// This channel does not buffer. Once the write succeeds, the monitor
+		// has read the stop request and will not make any further updates
+		// to c.State.Health.
+		s.stop <- struct{}{}
+		s.stop = nil
+		logrus.Debugf("CloseMonitorChannel done")
+	}
+}

+ 4 - 0
container/state.go

@@ -27,6 +27,7 @@ type State struct {
 	StartedAt         time.Time
 	FinishedAt        time.Time
 	waitChan          chan struct{}
+	Health            *Health
 }
 
 // NewState creates a default state object with a fresh channel for state changes.
@@ -46,6 +47,9 @@ func (s *State) String() string {
 			return fmt.Sprintf("Restarting (%d) %s ago", s.ExitCode, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt)))
 		}
 
+		if h := s.Health; h != nil {
+			return fmt.Sprintf("Up %s (%s)", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)), h.String())
+		}
 		return fmt.Sprintf("Up %s", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)))
 	}
 

+ 19 - 0
daemon/commit.go

@@ -80,6 +80,25 @@ func merge(userConf, imageConf *containertypes.Config) error {
 			userConf.Entrypoint = imageConf.Entrypoint
 		}
 	}
+	if imageConf.Healthcheck != nil {
+		if userConf.Healthcheck == nil {
+			userConf.Healthcheck = imageConf.Healthcheck
+		} else {
+			if len(userConf.Healthcheck.Test) == 0 {
+				userConf.Healthcheck.Test = imageConf.Healthcheck.Test
+			}
+			if userConf.Healthcheck.Interval == 0 {
+				userConf.Healthcheck.Interval = imageConf.Healthcheck.Interval
+			}
+			if userConf.Healthcheck.Timeout == 0 {
+				userConf.Healthcheck.Timeout = imageConf.Healthcheck.Timeout
+			}
+			if userConf.Healthcheck.Retries == 0 {
+				userConf.Healthcheck.Retries = imageConf.Healthcheck.Retries
+			}
+		}
+	}
+
 	if userConf.WorkingDir == "" {
 		userConf.WorkingDir = imageConf.WorkingDir
 	}

+ 23 - 5
daemon/exec.go

@@ -14,11 +14,15 @@ import (
 	"github.com/docker/docker/errors"
 	"github.com/docker/docker/errors"
 	"github.com/docker/docker/libcontainerd"
 	"github.com/docker/docker/libcontainerd"
 	"github.com/docker/docker/pkg/pools"
 	"github.com/docker/docker/pkg/pools"
+	"github.com/docker/docker/pkg/signal"
 	"github.com/docker/docker/pkg/term"
 	"github.com/docker/docker/pkg/term"
 	"github.com/docker/engine-api/types"
 	"github.com/docker/engine-api/types"
 	"github.com/docker/engine-api/types/strslice"
 	"github.com/docker/engine-api/types/strslice"
 )
 )
 
 
+// Seconds to wait after sending TERM before trying KILL
+const termProcessTimeout = 10
+
 func (d *Daemon) registerExecCommand(container *container.Container, config *exec.Config) {
 	// Storing execs in container in order to kill them gracefully whenever the container is stopped or removed.
 	container.ExecCommands.Add(config.ID, config)
@@ -130,7 +134,8 @@ func (d *Daemon) ContainerExecCreate(name string, config *types.ExecConfig) (str
 
 // ContainerExecStart starts a previously set up exec instance. The
 // std streams are set up.
-func (d *Daemon) ContainerExecStart(name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) (err error) {
+// If ctx is cancelled, the process is terminated.
+func (d *Daemon) ContainerExecStart(ctx context.Context, name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) (err error) {
 	var (
 		cStdin           io.ReadCloser
 		cStdout, cStderr io.Writer
@@ -197,15 +202,28 @@ func (d *Daemon) ContainerExecStart(name string, stdin io.ReadCloser, stdout io.
 		return nil
 	}
 
-	attachErr := container.AttachStreams(context.Background(), ec.StreamConfig, ec.OpenStdin, true, ec.Tty, cStdin, cStdout, cStderr, ec.DetachKeys)
+	attachErr := container.AttachStreams(ctx, ec.StreamConfig, ec.OpenStdin, true, ec.Tty, cStdin, cStdout, cStderr, ec.DetachKeys)
 
 	if err := d.containerd.AddProcess(c.ID, name, p); err != nil {
 		return err
 	}
 
-	err = <-attachErr
-	if err != nil {
-		return fmt.Errorf("attach failed with error: %v", err)
+	select {
+	case <-ctx.Done():
+		logrus.Debugf("Sending TERM signal to process %v in container %v", name, c.ID)
+		d.containerd.SignalProcess(c.ID, name, int(signal.SignalMap["TERM"]))
+		select {
+		case <-time.After(termProcessTimeout * time.Second):
+			logrus.Infof("Container %v, process %v failed to exit within %d seconds of signal TERM - using the force", c.ID, name, termProcessTimeout)
+			d.containerd.SignalProcess(c.ID, name, int(signal.SignalMap["KILL"]))
+		case <-attachErr:
+			// TERM signal worked
+		}
+		return fmt.Errorf("context cancelled")
+	case err := <-attachErr:
+		if err != nil {
+			return fmt.Errorf("attach failed with error: %v", err)
+		}
 	}
 	return nil
 }

+ 314 - 0
daemon/health.go

@@ -0,0 +1,314 @@
+package daemon
+
+import (
+	"bytes"
+	"fmt"
+	"runtime"
+	"strings"
+	"time"
+
+	"golang.org/x/net/context"
+
+	"github.com/Sirupsen/logrus"
+	"github.com/docker/docker/container"
+	"github.com/docker/docker/daemon/exec"
+	"github.com/docker/engine-api/types"
+	"github.com/docker/engine-api/types/strslice"
+)
+
+const (
+	// Longest healthcheck probe output message to store. Longer messages will be truncated.
+	maxOutputLen = 4096
+
+	// Default interval between probe runs (from the end of the first to the start of the second).
+	// Also the time before the first probe.
+	defaultProbeInterval = 30 * time.Second
+
+	// The maximum length of time a single probe run should take. If the probe takes longer
+	// than this, the check is considered to have failed.
+	defaultProbeTimeout = 30 * time.Second
+
+	// Shut down a container if it becomes Unhealthy.
+	defaultExitOnUnhealthy = true
+
+	// Maximum number of entries to record
+	maxLogEntries = 5
+)
+
+const (
+	// Exit status codes that can be returned by the probe command.
+
+	exitStatusHealthy   = 0 // Container is healthy
+	exitStatusUnhealthy = 1 // Container is unhealthy
+	exitStatusStarting  = 2 // Container needs more time to start
+)
+
+// probe implementations know how to run a particular type of probe.
+type probe interface {
+	// Perform one run of the check. Returns the exit code and an optional
+	// short diagnostic string.
+	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
+}
+
+// cmdProbe implements the "CMD" probe type.
+type cmdProbe struct {
+	// Run the command with the system's default shell instead of execing it directly.
+	shell bool
+}
+
+// exec the healthcheck command in the container.
+// Returns the exit code and probe output (if any)
+func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {
+	cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:]
+	if p.shell {
+		if runtime.GOOS != "windows" {
+			cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...)
+		} else {
+			cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...)
+		}
+	}
+	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
+	execConfig := exec.NewConfig()
+	execConfig.OpenStdin = false
+	execConfig.OpenStdout = true
+	execConfig.OpenStderr = true
+	execConfig.ContainerID = container.ID
+	execConfig.DetachKeys = []byte{}
+	execConfig.Entrypoint = entrypoint
+	execConfig.Args = args
+	execConfig.Tty = false
+	execConfig.Privileged = false
+	execConfig.User = container.Config.User
+
+	d.registerExecCommand(container, execConfig)
+	d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))
+
+	output := &limitedBuffer{}
+	err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
+	if err != nil {
+		return nil, err
+	}
+	info, err := d.getExecConfig(execConfig.ID)
+	if err != nil {
+		return nil, err
+	}
+	if info.ExitCode == nil {
+		return nil, fmt.Errorf("Healthcheck has no exit code!")
+	}
+	// Note: Go's json package will handle invalid UTF-8 for us
+	out := output.String()
+	return &types.HealthcheckResult{
+		End:      time.Now(),
+		ExitCode: *info.ExitCode,
+		Output:   out,
+	}, nil
+}
+
+// Update the container's Status.Health struct based on the latest probe's result.
+func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) {
+	c.Lock()
+	defer c.Unlock()
+
+	retries := c.Config.Healthcheck.Retries
+	if retries <= 0 {
+		retries = 1 // Default if unset or set to an invalid value
+	}
+
+	h := c.State.Health
+	oldStatus := h.Status
+
+	if len(h.Log) >= maxLogEntries {
+		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
+	} else {
+		h.Log = append(h.Log, result)
+	}
+
+	if result.ExitCode == exitStatusHealthy {
+		h.FailingStreak = 0
+		h.Status = types.Healthy
+	} else if result.ExitCode == exitStatusStarting && c.State.Health.Status == types.Starting {
+		// The container is not ready yet. Remain in the starting state.
+	} else {
+		// Failure (including invalid exit code)
+		h.FailingStreak++
+		if c.State.Health.FailingStreak >= retries {
+			h.Status = types.Unhealthy
+		}
+		// Else we're starting or healthy. Stay in that state.
+	}
+
+	if oldStatus != h.Status {
+		d.LogContainerEvent(c, "health_status: "+h.Status)
+	}
+}
+
+// Run the container's monitoring thread until notified via "stop".
+// There is never more than one monitor thread running per container at a time.
+func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
+	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
+	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
+	for {
+		select {
+		case <-stop:
+			logrus.Debugf("Stop healthcheck monitoring (received while idle)")
+			return
+		case <-time.After(probeInterval):
+			logrus.Debugf("Running health check...")
+			startTime := time.Now()
+			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
+			results := make(chan *types.HealthcheckResult)
+			go func() {
+				result, err := probe.run(ctx, d, c)
+				if err != nil {
+					logrus.Warnf("Health check error: %v", err)
+					results <- &types.HealthcheckResult{
+						ExitCode: -1,
+						Output:   err.Error(),
+						Start:    startTime,
+						End:      time.Now(),
+					}
+				} else {
+					result.Start = startTime
+					logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode)
+					results <- result
+				}
+				close(results)
+			}()
+			select {
+			case <-stop:
+				logrus.Debugf("Stop healthcheck monitoring (received while probing)")
+				// Stop timeout and kill probe, but don't wait for probe to exit.
+				cancelProbe()
+				return
+			case result := <-results:
+				handleProbeResult(d, c, result)
+				// Stop timeout
+				cancelProbe()
+			case <-ctx.Done():
+				logrus.Debugf("Health check taking too long")
+				handleProbeResult(d, c, &types.HealthcheckResult{
+					ExitCode: -1,
+					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
+					Start:    startTime,
+					End:      time.Now(),
+				})
+				cancelProbe()
+				// Wait for probe to exit (it might take a while to respond to the TERM
+				// signal and we don't want dying probes to pile up).
+				<-results
+			}
+		}
+	}
+}
+
+// Get a suitable probe implementation for the container's healthcheck configuration.
+func getProbe(c *container.Container) probe {
+	config := c.Config.Healthcheck
+	if config == nil || len(config.Test) == 0 {
+		return nil
+	}
+	switch config.Test[0] {
+	case "CMD":
+		return &cmdProbe{shell: false}
+	case "CMD-SHELL":
+		return &cmdProbe{shell: true}
+	default:
+		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0])
+		return nil
+	}
+}
+
+// Ensure the health-check monitor is running or not, depending on the current
+// state of the container.
+// Called from monitor.go, with c locked.
+func (d *Daemon) updateHealthMonitor(c *container.Container) {
+	h := c.State.Health
+	if h == nil {
+		return // No healthcheck configured
+	}
+
+	probe := getProbe(c)
+	wantRunning := c.Running && !c.Paused && probe != nil
+	if wantRunning {
+		if stop := h.OpenMonitorChannel(); stop != nil {
+			go monitor(d, c, stop, probe)
+		}
+	} else {
+		h.CloseMonitorChannel()
+	}
+}
+
+// Reset the health state for a newly-started, restarted or restored container.
+// initHealthMonitor is called from monitor.go and we should never be running
+// two instances at once.
+// Called with c locked.
+func (d *Daemon) initHealthMonitor(c *container.Container) {
+	if c.Config.Healthcheck == nil {
+		return
+	}
+
+	// This is needed in case we're auto-restarting
+	d.stopHealthchecks(c)
+
+	if c.State.Health == nil {
+		h := &container.Health{}
+		h.Status = types.Starting
+		h.FailingStreak = 0
+		c.State.Health = h
+	}
+
+	d.updateHealthMonitor(c)
+}
+
+// Called when the container is being stopped (whether because the health check is
+// failing or for any other reason).
+func (d *Daemon) stopHealthchecks(c *container.Container) {
+	h := c.State.Health
+	if h != nil {
+		h.CloseMonitorChannel()
+	}
+}
+
+// Buffer up to maxOutputLen bytes. Further data is discarded.
+type limitedBuffer struct {
+	buf       bytes.Buffer
+	truncated bool // indicates that data has been lost
+}
+
+// Append to limitedBuffer while there is room.
+func (b *limitedBuffer) Write(data []byte) (int, error) {
+	bufLen := b.buf.Len()
+	dataLen := len(data)
+	keep := min(maxOutputLen-bufLen, dataLen)
+	if keep > 0 {
+		b.buf.Write(data[:keep])
+	}
+	if keep < dataLen {
+		b.truncated = true
+	}
+	return dataLen, nil
+}
+
+// The contents of the buffer, with "..." appended if it overflowed.
+func (b *limitedBuffer) String() string {
+	out := b.buf.String()
+	if b.truncated {
+		out = out + "..."
+	}
+	return out
+}
+
+// If configuredValue is zero, use defaultValue instead.
+func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
+	if configuredValue == 0 {
+		return defaultValue
+	}
+	return configuredValue
+}
+
+func min(x, y int) int {
+	if x < y {
+		return x
+	}
+	return y
+}

+ 112 - 0
daemon/health_test.go

@@ -0,0 +1,112 @@
+package daemon
+
+import (
+	"testing"
+	"time"
+
+	"github.com/docker/docker/container"
+	"github.com/docker/docker/daemon/events"
+	"github.com/docker/engine-api/types"
+	containertypes "github.com/docker/engine-api/types/container"
+	eventtypes "github.com/docker/engine-api/types/events"
+)
+
+func reset(c *container.Container) {
+	c.State = &container.State{}
+	c.State.Health = &container.Health{}
+	c.State.Health.Status = types.Starting
+}
+
+func TestHealthStates(t *testing.T) {
+	e := events.New()
+	_, l, _ := e.Subscribe()
+	defer e.Evict(l)
+
+	expect := func(expected string) {
+		select {
+		case event := <-l:
+			ev := event.(eventtypes.Message)
+			if ev.Status != expected {
+				t.Errorf("Expecting event %#v, but got %#v\n", expected, ev.Status)
+			}
+		case <-time.After(1 * time.Second):
+			t.Errorf("Expecting event %#v, but got nothing\n", expected)
+		}
+	}
+
+	c := &container.Container{
+		CommonContainer: container.CommonContainer{
+			ID:   "container_id",
+			Name: "container_name",
+			Config: &containertypes.Config{
+				Image: "image_name",
+			},
+		},
+	}
+	daemon := &Daemon{
+		EventsService: e,
+	}
+
+	c.Config.Healthcheck = &containertypes.HealthConfig{
+		Retries: 1,
+	}
+
+	reset(c)
+
+	handleResult := func(startTime time.Time, exitCode int) {
+		handleProbeResult(daemon, c, &types.HealthcheckResult{
+			Start:    startTime,
+			End:      startTime,
+			ExitCode: exitCode,
+		})
+	}
+
+	// starting -> failed -> success -> failed
+
+	handleResult(c.State.StartedAt.Add(1*time.Second), 1)
+	expect("health_status: unhealthy")
+
+	handleResult(c.State.StartedAt.Add(2*time.Second), 0)
+	expect("health_status: healthy")
+
+	handleResult(c.State.StartedAt.Add(3*time.Second), 1)
+	expect("health_status: unhealthy")
+
+	// starting -> starting -> starting ->
+	// healthy -> starting (invalid transition)
+
+	reset(c)
+
+	handleResult(c.State.StartedAt.Add(20*time.Second), 2)
+	handleResult(c.State.StartedAt.Add(40*time.Second), 2)
+	if c.State.Health.Status != types.Starting {
+		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
+	}
+
+	handleResult(c.State.StartedAt.Add(50*time.Second), 0)
+	expect("health_status: healthy")
+	handleResult(c.State.StartedAt.Add(60*time.Second), 2)
+	expect("health_status: unhealthy")
+
+	// Test retries
+
+	reset(c)
+	c.Config.Healthcheck.Retries = 3
+
+	handleResult(c.State.StartedAt.Add(20*time.Second), 1)
+	handleResult(c.State.StartedAt.Add(40*time.Second), 1)
+	if c.State.Health.Status != types.Starting {
+		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
+	}
+	if c.State.Health.FailingStreak != 2 {
+		t.Errorf("Expecting FailingStreak=2, but got %d\n", c.State.Health.FailingStreak)
+	}
+	handleResult(c.State.StartedAt.Add(60*time.Second), 1)
+	expect("health_status: unhealthy")
+
+	handleResult(c.State.StartedAt.Add(80*time.Second), 0)
+	expect("health_status: healthy")
+	if c.State.Health.FailingStreak != 0 {
+		t.Errorf("Expecting FailingStreak=0, but got %d\n", c.State.Health.FailingStreak)
+	}
+}

+ 10 - 0
daemon/inspect.go

@@ -108,6 +108,15 @@ func (daemon *Daemon) getInspectData(container *container.Container, size bool)
 		hostConfig.Links = append(hostConfig.Links, fmt.Sprintf("%s:%s", child.Name, linkAlias))
 	}
 
+	var containerHealth *types.Health
+	if container.State.Health != nil {
+		containerHealth = &types.Health{
+			Status:        container.State.Health.Status,
+			FailingStreak: container.State.Health.FailingStreak,
+			Log:           append([]*types.HealthcheckResult{}, container.State.Health.Log...),
+		}
+	}
+
 	containerState := &types.ContainerState{
 		Status:     container.State.StateString(),
 		Running:    container.State.Running,
@@ -120,6 +129,7 @@ func (daemon *Daemon) getInspectData(container *container.Container, size bool)
 		Error:      container.State.Error,
 		StartedAt:  container.State.StartedAt.Format(time.RFC3339Nano),
 		FinishedAt: container.State.FinishedAt.Format(time.RFC3339Nano),
+		Health:     containerHealth,
 	}
 
 	contJSONBase := &types.ContainerJSONBase{

+ 9 - 0
daemon/monitor.go

@@ -25,6 +25,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 		if runtime.GOOS == "windows" {
 			return errors.New("Received StateOOM from libcontainerd on Windows. This should never happen.")
 		}
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEvent(c, "oom")
 	case libcontainerd.StateExit:
 		c.Lock()
@@ -35,6 +36,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 		attributes := map[string]string{
 			"exitCode": strconv.Itoa(int(e.ExitCode)),
 		}
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEventWithAttributes(c, "die", attributes)
 		daemon.Cleanup(c)
 		// FIXME: here is race condition between two RUN instructions in Dockerfile
@@ -54,6 +56,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 			"exitCode": strconv.Itoa(int(e.ExitCode)),
 			"exitCode": strconv.Itoa(int(e.ExitCode)),
 		}
 		}
 		daemon.LogContainerEventWithAttributes(c, "die", attributes)
 		daemon.LogContainerEventWithAttributes(c, "die", attributes)
+		daemon.updateHealthMonitor(c)
 		return c.ToDisk()
 		return c.ToDisk()
 	case libcontainerd.StateExitProcess:
 	case libcontainerd.StateExitProcess:
 		c.Lock()
 		c.Lock()
@@ -74,18 +77,24 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 			logrus.Warnf("Ignoring StateExitProcess for %v but no exec command found", e)
 		}
 	case libcontainerd.StateStart, libcontainerd.StateRestore:
+		// Container is already locked in this case
 		c.SetRunning(int(e.Pid), e.State == libcontainerd.StateStart)
 		c.HasBeenManuallyStopped = false
 		if err := c.ToDisk(); err != nil {
 			c.Reset(false)
 			return err
 		}
+		daemon.initHealthMonitor(c)
 		daemon.LogContainerEvent(c, "start")
 	case libcontainerd.StatePause:
+		// Container is already locked in this case
 		c.Paused = true
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEvent(c, "pause")
 	case libcontainerd.StateResume:
+		// Container is already locked in this case
 		c.Paused = false
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEvent(c, "unpause")
 	}
 

+ 2 - 0
daemon/stop.go

@@ -41,6 +41,8 @@ func (daemon *Daemon) containerStop(container *container.Container, seconds int)
 		return nil
 	}
 
+	daemon.stopHealthchecks(container)
+
 	stopSignal := container.StopSignal()
 	// 1. Send a stop signal
 	if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {

+ 67 - 0
docs/reference/builder.md

@@ -1470,6 +1470,73 @@ The `STOPSIGNAL` instruction sets the system call signal that will be sent to th
 This signal can be a valid unsigned number that matches a position in the kernel's syscall table, for instance 9,
 or a signal name in the format SIGNAME, for instance SIGKILL.
 
+## HEALTHCHECK
+
+The `HEALTHCHECK` instruction has two forms:
+
+* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container)
+* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image)
+
+The `HEALTHCHECK` instruction tells Docker how to test a container to check that
+it is still working. This can detect cases such as a web server that is stuck in
+an infinite loop and unable to handle new connections, even though the server
+process is still running.
+
+When a container has a healthcheck specified, it has a _health status_ in
+addition to its normal status. This status is initially `starting`. Whenever a
+health check passes, it becomes `healthy` (whatever state it was previously in).
+After a certain number of consecutive failures, it becomes `unhealthy`.
+
+The options that can appear before `CMD` are:
+
+* `--interval=DURATION` (default: `30s`)
+* `--timeout=DURATION` (default: `30s`)
+* `--retries=N` (default: `1`)
+
+The health check will first run **interval** seconds after the container is
+started, and then again **interval** seconds after each previous check completes.
+
+If a single run of the check takes longer than **timeout** seconds then the check
+is considered to have failed.
+
+It takes **retries** consecutive failures of the health check for the container
+to be considered `unhealthy`.
+
+There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list
+more than one then only the last `HEALTHCHECK` will take effect.
+
+The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK
+CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands;
+see e.g. `ENTRYPOINT` for details).
+
+The command's exit status indicates the health status of the container.
+The possible values are:
+
+- 0: success - the container is healthy and ready for use
+- 1: unhealthy - the container is not working correctly
+- 2: starting - the container is not ready for use yet, but is working correctly
+
+If the probe returns 2 ("starting") when the container has already moved out of the
+"starting" state then it is treated as "unhealthy" instead.
+
+For example, to check every five minutes or so that a web server is able to
+serve the site's main page within three seconds:
+
+    HEALTHCHECK --interval=5m --timeout=3s \
+      CMD curl -f http://localhost/ || exit 1
+
+To help debug failing probes, any output text (UTF-8 encoded) that the command writes
+on stdout or stderr will be stored in the health status and can be queried with
+`docker inspect`. Such output should be kept short (only the first 4096 bytes
+are stored currently).
+
+When the health status of a container changes, a `health_status` event is
+generated with the new status.
+
+The `HEALTHCHECK` feature was added in Docker 1.12.
+
+
+
 ## Dockerfile examples
 
 Below you can see some examples of Dockerfile syntax. If you're interested in

+ 60 - 0
docs/reference/run.md

@@ -1250,6 +1250,7 @@ Dockerfile instruction and how the operator can override that setting.
     #entrypoint-default-command-to-execute-at-runtime)
  - [EXPOSE (Incoming Ports)](#expose-incoming-ports)
  - [ENV (Environment Variables)](#env-environment-variables)
+ - [HEALTHCHECK](#healthcheck)
  - [VOLUME (Shared Filesystems)](#volume-shared-filesystems)
  - [USER](#user)
  - [WORKDIR](#workdir)
@@ -1398,6 +1399,65 @@ above, or already defined by the developer with a Dockerfile `ENV`:
 
 Similarly the operator can set the **hostname** with `-h`.
 
+### HEALTHCHECK
+
+```
+  --health-cmd            Command to run to check health
+  --health-interval       Time between running the check
+  --health-retries        Consecutive failures needed to report unhealthy
+  --health-timeout        Maximum time to allow one check to run
+  --no-healthcheck        Disable any container-specified HEALTHCHECK
+```
+
+Example:
+
+    $ docker run --name=test -d \
+        --health-cmd='stat /etc/passwd || exit 1' \
+        --health-interval=2s \
+        busybox sleep 1d
+    $ sleep 2; docker inspect --format='{{.State.Health.Status}}' test
+    healthy
+    $ docker exec test rm /etc/passwd
+    $ sleep 2; docker inspect --format='{{json .State.Health}}' test
+    {
+      "Status": "unhealthy",
+      "FailingStreak": 3,
+      "Log": [
+        {
+          "Start": "2016-05-25T17:22:04.635478668Z",
+          "End": "2016-05-25T17:22:04.7272552Z",
+          "ExitCode": 0,
+          "Output": "  File: /etc/passwd\n  Size: 334       \tBlocks: 8          IO Block: 4096   regular file\nDevice: 32h/50d\tInode: 12          Links: 1\nAccess: (0664/-rw-rw-r--)  Uid: (    0/    root)   Gid: (    0/    root)\nAccess: 2015-12-05 22:05:32.000000000\nModify: 2015..."
+        },
+        {
+          "Start": "2016-05-25T17:22:06.732900633Z",
+          "End": "2016-05-25T17:22:06.822168935Z",
+          "ExitCode": 0,
+          "Output": "  File: /etc/passwd\n  Size: 334       \tBlocks: 8          IO Block: 4096   regular file\nDevice: 32h/50d\tInode: 12          Links: 1\nAccess: (0664/-rw-rw-r--)  Uid: (    0/    root)   Gid: (    0/    root)\nAccess: 2015-12-05 22:05:32.000000000\nModify: 2015..."
+        },
+        {
+          "Start": "2016-05-25T17:22:08.823956535Z",
+          "End": "2016-05-25T17:22:08.897359124Z",
+          "ExitCode": 1,
+          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
+        },
+        {
+          "Start": "2016-05-25T17:22:10.898802931Z",
+          "End": "2016-05-25T17:22:10.969631866Z",
+          "ExitCode": 1,
+          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
+        },
+        {
+          "Start": "2016-05-25T17:22:12.971033523Z",
+          "End": "2016-05-25T17:22:13.082015516Z",
+          "ExitCode": 1,
+          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
+        }
+      ]
+    }
+
+The health status is also displayed in the `docker ps` output.
+
 ### TMPFS (mount tmpfs filesystems)
 
 ```bash

+ 154 - 0
integration-cli/docker_cli_health_test.go

@@ -0,0 +1,154 @@
+package main
+
+import (
+	"encoding/json"
+	"github.com/docker/docker/pkg/integration/checker"
+	"github.com/docker/engine-api/types"
+	"github.com/go-check/check"
+	"strconv"
+	"strings"
+	"time"
+)
+
+func waitForStatus(c *check.C, name string, prev string, expected string) {
+	prev = prev + "\n"
+	expected = expected + "\n"
+	for {
+		out, _ := dockerCmd(c, "inspect", "--format={{.State.Status}}", name)
+		if out == expected {
+			return
+		}
+		c.Check(out, checker.Equals, prev)
+		if out != prev {
+			return
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+}
+
+func waitForHealthStatus(c *check.C, name string, prev string, expected string) {
+	prev = prev + "\n"
+	expected = expected + "\n"
+	for {
+		out, _ := dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name)
+		if out == expected {
+			return
+		}
+		c.Check(out, checker.Equals, prev)
+		if out != prev {
+			return
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+}
+
+func getHealth(c *check.C, name string) *types.Health {
+	out, _ := dockerCmd(c, "inspect", "--format={{json .State.Health}}", name)
+	var health types.Health
+	err := json.Unmarshal([]byte(out), &health)
+	c.Check(err, checker.Equals, nil)
+	return &health
+}
+
+func (s *DockerSuite) TestHealth(c *check.C) {
+	testRequires(c, DaemonIsLinux) // busybox doesn't work on Windows
+
+	imageName := "testhealth"
+	_, err := buildImage(imageName,
+		`FROM busybox
+		RUN echo OK > /status
+		CMD ["/bin/sleep", "120"]
+		STOPSIGNAL SIGKILL
+		HEALTHCHECK --interval=1s --timeout=30s \
+		  CMD cat /status`,
+		true)
+
+	c.Check(err, check.IsNil)
+
+	// No health status before starting
+	name := "test_health"
+	dockerCmd(c, "create", "--name", name, imageName)
+	out, _ := dockerCmd(c, "ps", "-a", "--format={{.Status}}")
+	c.Check(out, checker.Equals, "Created\n")
+
+	// Inspect the options
+	out, _ = dockerCmd(c, "inspect",
+		"--format='timeout={{.Config.Healthcheck.Timeout}} "+
+			"interval={{.Config.Healthcheck.Interval}} "+
+			"retries={{.Config.Healthcheck.Retries}} "+
+			"test={{.Config.Healthcheck.Test}}'", name)
+	c.Check(out, checker.Equals, "timeout=30s interval=1s retries=0 test=[CMD-SHELL cat /status]\n")
+
+	// Start
+	dockerCmd(c, "start", name)
+	waitForHealthStatus(c, name, "starting", "healthy")
+
+	// Make it fail
+	dockerCmd(c, "exec", name, "rm", "/status")
+	waitForHealthStatus(c, name, "healthy", "unhealthy")
+
+	// Inspect the status
+	out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name)
+	c.Check(out, checker.Equals, "unhealthy\n")
+
+	// Make it healthy again
+	dockerCmd(c, "exec", name, "touch", "/status")
+	waitForHealthStatus(c, name, "unhealthy", "healthy")
+
+	// Remove container
+	dockerCmd(c, "rm", "-f", name)
+
+	// Disable the check from the CLI
+	out, _ = dockerCmd(c, "create", "--name=noh", "--no-healthcheck", imageName)
+	out, _ = dockerCmd(c, "inspect", "--format={{.Config.Healthcheck.Test}}", "noh")
+	c.Check(out, checker.Equals, "[NONE]\n")
+	dockerCmd(c, "rm", "noh")
+
+	// Disable the check with a new build
+	_, err = buildImage("no_healthcheck",
+		`FROM testhealth
+		HEALTHCHECK NONE`, true)
+	c.Check(err, check.IsNil)
+
+	out, _ = dockerCmd(c, "inspect", "--format={{.ContainerConfig.Healthcheck.Test}}", "no_healthcheck")
+	c.Check(out, checker.Equals, "[NONE]\n")
+
+	// Enable the checks from the CLI
+	_, _ = dockerCmd(c, "run", "-d", "--name=fatal_healthcheck",
+		"--health-interval=0.5s",
+		"--health-retries=3",
+		"--health-cmd=cat /status",
+		"no_healthcheck")
+	waitForHealthStatus(c, "fatal_healthcheck", "starting", "healthy")
+	health := getHealth(c, "fatal_healthcheck")
+	c.Check(health.Status, checker.Equals, "healthy")
+	c.Check(health.FailingStreak, checker.Equals, 0)
+	last := health.Log[len(health.Log)-1]
+	c.Check(last.ExitCode, checker.Equals, 0)
+	c.Check(last.Output, checker.Equals, "OK\n")
+
+	// Fail the check, which should now make it exit
+	dockerCmd(c, "exec", "fatal_healthcheck", "rm", "/status")
+	waitForStatus(c, "fatal_healthcheck", "running", "exited")
+
+	out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", "fatal_healthcheck")
+	c.Check(out, checker.Equals, "unhealthy\n")
+	failsStr, _ := dockerCmd(c, "inspect", "--format={{.State.Health.FailingStreak}}", "fatal_healthcheck")
+	fails, err := strconv.Atoi(strings.TrimSpace(failsStr))
+	c.Check(err, check.IsNil)
+	c.Check(fails >= 3, checker.Equals, true)
+	dockerCmd(c, "rm", "-f", "fatal_healthcheck")
+
+	// Check timeout
+	// Note: if the interval is too small, it seems that Docker spends all its time running health
+	// checks and never gets around to killing it.
+	_, _ = dockerCmd(c, "run", "-d", "--name=test",
+		"--health-interval=1s", "--health-cmd=sleep 5m", "--health-timeout=1ms", imageName)
+	waitForHealthStatus(c, "test", "starting", "unhealthy")
+	health = getHealth(c, "test")
+	last = health.Log[len(health.Log)-1]
+	c.Check(health.Status, checker.Equals, "unhealthy")
+	c.Check(last.ExitCode, checker.Equals, -1)
+	c.Check(last.Output, checker.Equals, "Health check exceeded timeout (1ms)")
+	dockerCmd(c, "rm", "-f", "test")
+}

+ 11 - 0
libcontainerd/client_linux.go

@@ -190,6 +190,17 @@ func (clnt *client) Signal(containerID string, sig int) error {
 	return err
 }
 
+func (clnt *client) SignalProcess(containerID string, pid string, sig int) error {
+	clnt.lock(containerID)
+	defer clnt.unlock(containerID)
+	_, err := clnt.remote.apiClient.Signal(context.Background(), &containerd.SignalRequest{
+		Id:     containerID,
+		Pid:    pid,
+		Signal: uint32(sig),
+	})
+	return err
+}
+
 func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error {
 	clnt.lock(containerID)
 	defer clnt.unlock(containerID)

+ 19 - 0
libcontainerd/client_windows.go

@@ -304,6 +304,25 @@ func (clnt *client) Signal(containerID string, sig int) error {
 	return nil
 }
 
+// While Linux has support for the full range of signals, signals aren't really implemented on Windows.
+// We try to terminate the specified process whatever signal is requested.
+func (clnt *client) SignalProcess(containerID string, processFriendlyName string, sig int) error {
+	clnt.lock(containerID)
+	defer clnt.unlock(containerID)
+	cont, err := clnt.getContainer(containerID)
+	if err != nil {
+		return err
+	}
+
+	for _, p := range cont.processes {
+		if p.friendlyName == processFriendlyName {
+			return hcsshim.TerminateProcessInComputeSystem(containerID, p.systemPid)
+		}
+	}
+
+	return fmt.Errorf("SignalProcess could not find process %s in %s", processFriendlyName, containerID)
+}
+
 // Resize handles a CLI event to resize an interactive docker run or docker exec
 // window.
 func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error {

+ 1 - 0
libcontainerd/types.go

@@ -34,6 +34,7 @@ type Backend interface {
 type Client interface {
 	Create(containerID string, spec Spec, options ...CreateOption) error
 	Signal(containerID string, sig int) error
+	SignalProcess(containerID string, processFriendlyName string, sig int) error
 	AddProcess(containerID, processFriendlyName string, process Process) error
 	Resize(containerID, processFriendlyName string, width, height int) error
 	Pause(containerID string) error

+ 40 - 0
runconfig/opts/parse.go

@@ -100,6 +100,12 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		flStopSignal        = cmd.String([]string{"-stop-signal"}, signal.DefaultStopSignal, fmt.Sprintf("Signal to stop a container, %v by default", signal.DefaultStopSignal))
 		flIsolation         = cmd.String([]string{"-isolation"}, "", "Container isolation technology")
 		flShmSize           = cmd.String([]string{"-shm-size"}, "", "Size of /dev/shm, default value is 64MB")
+		// Healthcheck
+		flNoHealthcheck  = cmd.Bool([]string{"-no-healthcheck"}, false, "Disable any container-specified HEALTHCHECK")
+		flHealthCmd      = cmd.String([]string{"-health-cmd"}, "", "Command to run to check health")
+		flHealthInterval = cmd.Duration([]string{"-health-interval"}, 0, "Time between running the check")
+		flHealthTimeout  = cmd.Duration([]string{"-health-timeout"}, 0, "Maximum time to allow one check to run")
+		flHealthRetries  = cmd.Int([]string{"-health-retries"}, 0, "Consecutive failures needed to report unhealthy")
 	)
 
 	cmd.Var(&flAttach, []string{"a", "-attach"}, "Attach to STDIN, STDOUT or STDERR")
@@ -351,6 +357,39 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		return nil, nil, nil, cmd, err
 	}
 
+	// Healthcheck
+	var healthConfig *container.HealthConfig
+	haveHealthSettings := *flHealthCmd != "" ||
+		*flHealthInterval != 0 ||
+		*flHealthTimeout != 0 ||
+		*flHealthRetries != 0
+	if *flNoHealthcheck {
+		if haveHealthSettings {
+			return nil, nil, nil, cmd, fmt.Errorf("--no-healthcheck conflicts with --health-* options")
+		}
+		test := strslice.StrSlice{"NONE"}
+		healthConfig = &container.HealthConfig{Test: test}
+	} else if haveHealthSettings {
+		var probe strslice.StrSlice
+		if *flHealthCmd != "" {
+			args := []string{"CMD-SHELL", *flHealthCmd}
+			probe = strslice.StrSlice(args)
+		}
+		if *flHealthInterval < 0 {
+			return nil, nil, nil, cmd, fmt.Errorf("--health-interval cannot be negative")
+		}
+		if *flHealthTimeout < 0 {
+			return nil, nil, nil, cmd, fmt.Errorf("--health-timeout cannot be negative")
+		}
+
+		healthConfig = &container.HealthConfig{
+			Test:     probe,
+			Interval: *flHealthInterval,
+			Timeout:  *flHealthTimeout,
+			Retries:  *flHealthRetries,
+		}
+	}
+
 	resources := container.Resources{
 		CgroupParent:         *flCgroupParent,
 		Memory:               flMemory,
@@ -399,6 +438,7 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		Entrypoint:      entrypoint,
 		WorkingDir:      *flWorkingDir,
 		Labels:          ConvertKVStringsToMap(labels),
+		Healthcheck:     healthConfig,
 	}
 	if cmd.IsSet("-stop-signal") {
 		config.StopSignal = *flStopSignal

+ 40 - 0
runconfig/opts/parse_test.go

@@ -9,6 +9,7 @@ import (
 	"runtime"
 	"runtime"
 	"strings"
 	"strings"
 	"testing"
 	"testing"
+	"time"
 
 
 	flag "github.com/docker/docker/pkg/mflag"
 	flag "github.com/docker/docker/pkg/mflag"
 	"github.com/docker/docker/runconfig"
 	"github.com/docker/docker/runconfig"
@@ -584,6 +585,45 @@ func TestParseRestartPolicy(t *testing.T) {
 	}
 }
 
+func TestParseHealth(t *testing.T) {
+	checkOk := func(args ...string) *container.HealthConfig {
+		config, _, _, _, err := parseRun(args)
+		if err != nil {
+			t.Fatalf("%#v: %v", args, err)
+		}
+		return config.Healthcheck
+	}
+	checkError := func(expected string, args ...string) {
+		config, _, _, _, err := parseRun(args)
+		if err == nil {
+			t.Fatalf("Expected error, but got %#v", config)
+		}
+		if err.Error() != expected {
+			t.Fatalf("Expected %#v, got %#v", expected, err)
+		}
+	}
+	health := checkOk("--no-healthcheck", "img", "cmd")
+	if health == nil || len(health.Test) != 1 || health.Test[0] != "NONE" {
+		t.Fatalf("--no-healthcheck failed: %#v", health)
+	}
+
+	health = checkOk("--health-cmd=/check.sh -q", "img", "cmd")
+	if len(health.Test) != 2 || health.Test[0] != "CMD-SHELL" || health.Test[1] != "/check.sh -q" {
+		t.Fatalf("--health-cmd: got %#v", health.Test)
+	}
+	if health.Timeout != 0 {
+		t.Fatalf("--health-cmd: timeout = %v", health.Timeout)
+	}
+
+	checkError("--no-healthcheck conflicts with --health-* options",
+		"--no-healthcheck", "--health-cmd=/check.sh -q", "img", "cmd")
+
+	health = checkOk("--health-timeout=2s", "--health-retries=3", "--health-interval=4.5s", "img", "cmd")
+	if health.Timeout != 2*time.Second || health.Retries != 3 || health.Interval != 4500*time.Millisecond {
+		t.Fatalf("--health-*: got %#v", health)
+	}
+}
+
 func TestParseLoggingOpts(t *testing.T) {
 	// logging opts ko
 	if _, _, _, _, err := parseRun([]string{"--log-driver=none", "--log-opt=anything", "img", "cmd"}); err == nil || err.Error() != "invalid logging opts for driver none" {