Selaa lähdekoodia

Add User defined Healthchecks for Containers

Carry of #22719
Michael Crosby 9 vuotta sitten
vanhempi
commit
ce255f76c7

+ 1 - 1
api/server/router/container/backend.go

@@ -17,7 +17,7 @@ type execBackend interface {
 	ContainerExecCreate(name string, config *types.ExecConfig) (string, error)
 	ContainerExecCreate(name string, config *types.ExecConfig) (string, error)
 	ContainerExecInspect(id string) (*backend.ExecInspect, error)
 	ContainerExecInspect(id string) (*backend.ExecInspect, error)
 	ContainerExecResize(name string, height, width int) error
 	ContainerExecResize(name string, height, width int) error
-	ContainerExecStart(name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) error
+	ContainerExecStart(ctx context.Context, name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) error
 	ExecExists(name string) (bool, error)
 	ExecExists(name string) (bool, error)
 }
 }
 
 

+ 2 - 1
api/server/router/container/exec.go

@@ -106,7 +106,8 @@ func (s *containerRouter) postContainerExecStart(ctx context.Context, w http.Res
 	}
 	}
 
 
 	// Now run the user process in container.
 	// Now run the user process in container.
-	if err := s.backend.ContainerExecStart(execName, stdin, stdout, stderr); err != nil {
+	// Maybe we should we pass ctx here if we're not detaching?
+	if err := s.backend.ContainerExecStart(context.Background(), execName, stdin, stdout, stderr); err != nil {
 		if execStartCheck.Detach {
 		if execStartCheck.Detach {
 			return err
 			return err
 		}
 		}

+ 10 - 9
builder/dockerfile/builder.go

@@ -22,15 +22,16 @@ import (
 )
 )
 
 
 var validCommitCommands = map[string]bool{
 var validCommitCommands = map[string]bool{
-	"cmd":        true,
-	"entrypoint": true,
-	"env":        true,
-	"expose":     true,
-	"label":      true,
-	"onbuild":    true,
-	"user":       true,
-	"volume":     true,
-	"workdir":    true,
+	"cmd":         true,
+	"entrypoint":  true,
+	"healthcheck": true,
+	"env":         true,
+	"expose":      true,
+	"label":       true,
+	"onbuild":     true,
+	"user":        true,
+	"volume":      true,
+	"workdir":     true,
 }
 }
 
 
 // BuiltinAllowedBuildArgs is list of built-in allowed build args
 // BuiltinAllowedBuildArgs is list of built-in allowed build args

+ 34 - 32
builder/dockerfile/command/command.go

@@ -3,40 +3,42 @@ package command
 
 
 // Define constants for the command strings
 // Define constants for the command strings
 const (
 const (
-	Env        = "env"
-	Label      = "label"
-	Maintainer = "maintainer"
-	Add        = "add"
-	Copy       = "copy"
-	From       = "from"
-	Onbuild    = "onbuild"
-	Workdir    = "workdir"
-	Run        = "run"
-	Cmd        = "cmd"
-	Entrypoint = "entrypoint"
-	Expose     = "expose"
-	Volume     = "volume"
-	User       = "user"
-	StopSignal = "stopsignal"
-	Arg        = "arg"
+	Env         = "env"
+	Label       = "label"
+	Maintainer  = "maintainer"
+	Add         = "add"
+	Copy        = "copy"
+	From        = "from"
+	Onbuild     = "onbuild"
+	Workdir     = "workdir"
+	Run         = "run"
+	Cmd         = "cmd"
+	Entrypoint  = "entrypoint"
+	Expose      = "expose"
+	Volume      = "volume"
+	User        = "user"
+	StopSignal  = "stopsignal"
+	Arg         = "arg"
+	Healthcheck = "healthcheck"
 )
 )
 
 
 // Commands is list of all Dockerfile commands
 // Commands is list of all Dockerfile commands
 var Commands = map[string]struct{}{
 var Commands = map[string]struct{}{
-	Env:        {},
-	Label:      {},
-	Maintainer: {},
-	Add:        {},
-	Copy:       {},
-	From:       {},
-	Onbuild:    {},
-	Workdir:    {},
-	Run:        {},
-	Cmd:        {},
-	Entrypoint: {},
-	Expose:     {},
-	Volume:     {},
-	User:       {},
-	StopSignal: {},
-	Arg:        {},
+	Env:         {},
+	Label:       {},
+	Maintainer:  {},
+	Add:         {},
+	Copy:        {},
+	From:        {},
+	Onbuild:     {},
+	Workdir:     {},
+	Run:         {},
+	Cmd:         {},
+	Entrypoint:  {},
+	Expose:      {},
+	Volume:      {},
+	User:        {},
+	StopSignal:  {},
+	Arg:         {},
+	Healthcheck: {},
 }
 }

+ 107 - 0
builder/dockerfile/dispatchers.go

@@ -12,7 +12,9 @@ import (
 	"regexp"
 	"regexp"
 	"runtime"
 	"runtime"
 	"sort"
 	"sort"
+	"strconv"
 	"strings"
 	"strings"
+	"time"
 
 
 	"github.com/Sirupsen/logrus"
 	"github.com/Sirupsen/logrus"
 	"github.com/docker/docker/api"
 	"github.com/docker/docker/api"
@@ -426,6 +428,111 @@ func cmd(b *Builder, args []string, attributes map[string]bool, original string)
 	return nil
 	return nil
 }
 }
 
 
+// parseOptInterval(flag) is the duration of flag.Value, or 0 if
+// empty. An error is reported if the value is given and is not positive.
+func parseOptInterval(f *Flag) (time.Duration, error) {
+	s := f.Value
+	if s == "" {
+		return 0, nil
+	}
+	d, err := time.ParseDuration(s)
+	if err != nil {
+		return 0, err
+	}
+	if d <= 0 {
+		return 0, fmt.Errorf("Interval %#v must be positive", f.name)
+	}
+	return d, nil
+}
+
+// HEALTHCHECK foo
+//
+// Set the default healthcheck command to run in the container (which may be empty).
+// Argument handling is the same as RUN.
+//
+func healthcheck(b *Builder, args []string, attributes map[string]bool, original string) error {
+	if len(args) == 0 {
+		return fmt.Errorf("HEALTHCHECK requires an argument")
+	}
+	typ := strings.ToUpper(args[0])
+	args = args[1:]
+	if typ == "NONE" {
+		if len(args) != 0 {
+			return fmt.Errorf("HEALTHCHECK NONE takes no arguments")
+		}
+		test := strslice.StrSlice{typ}
+		b.runConfig.Healthcheck = &container.HealthConfig{
+			Test: test,
+		}
+	} else {
+		if b.runConfig.Healthcheck != nil {
+			oldCmd := b.runConfig.Healthcheck.Test
+			if len(oldCmd) > 0 && oldCmd[0] != "NONE" {
+				fmt.Fprintf(b.Stdout, "Note: overriding previous HEALTHCHECK: %v\n", oldCmd)
+			}
+		}
+
+		healthcheck := container.HealthConfig{}
+
+		flInterval := b.flags.AddString("interval", "")
+		flTimeout := b.flags.AddString("timeout", "")
+		flRetries := b.flags.AddString("retries", "")
+
+		if err := b.flags.Parse(); err != nil {
+			return err
+		}
+
+		switch typ {
+		case "CMD":
+			cmdSlice := handleJSONArgs(args, attributes)
+			if len(cmdSlice) == 0 {
+				return fmt.Errorf("Missing command after HEALTHCHECK CMD")
+			}
+
+			if !attributes["json"] {
+				typ = "CMD-SHELL"
+			}
+
+			healthcheck.Test = strslice.StrSlice(append([]string{typ}, cmdSlice...))
+		default:
+			return fmt.Errorf("Unknown type %#v in HEALTHCHECK (try CMD)", typ)
+		}
+
+		interval, err := parseOptInterval(flInterval)
+		if err != nil {
+			return err
+		}
+		healthcheck.Interval = interval
+
+		timeout, err := parseOptInterval(flTimeout)
+		if err != nil {
+			return err
+		}
+		healthcheck.Timeout = timeout
+
+		if flRetries.Value != "" {
+			retries, err := strconv.ParseInt(flRetries.Value, 10, 32)
+			if err != nil {
+				return err
+			}
+			if retries < 1 {
+				return fmt.Errorf("--retries must be at least 1 (not %d)", retries)
+			}
+			healthcheck.Retries = int(retries)
+		} else {
+			healthcheck.Retries = 0
+		}
+
+		b.runConfig.Healthcheck = &healthcheck
+	}
+
+	if err := b.commit("", b.runConfig.Cmd, fmt.Sprintf("HEALTHCHECK %q", b.runConfig.Healthcheck)); err != nil {
+		return err
+	}
+
+	return nil
+}
+
 // ENTRYPOINT /usr/sbin/nginx
 // ENTRYPOINT /usr/sbin/nginx
 //
 //
 // Set the entrypoint (which defaults to sh -c on linux, or cmd /S /C on Windows) to
 // Set the entrypoint (which defaults to sh -c on linux, or cmd /S /C on Windows) to

+ 17 - 16
builder/dockerfile/evaluator.go

@@ -58,22 +58,23 @@ var evaluateTable map[string]func(*Builder, []string, map[string]bool, string) e
 
 
 func init() {
 func init() {
 	evaluateTable = map[string]func(*Builder, []string, map[string]bool, string) error{
 	evaluateTable = map[string]func(*Builder, []string, map[string]bool, string) error{
-		command.Env:        env,
-		command.Label:      label,
-		command.Maintainer: maintainer,
-		command.Add:        add,
-		command.Copy:       dispatchCopy, // copy() is a go builtin
-		command.From:       from,
-		command.Onbuild:    onbuild,
-		command.Workdir:    workdir,
-		command.Run:        run,
-		command.Cmd:        cmd,
-		command.Entrypoint: entrypoint,
-		command.Expose:     expose,
-		command.Volume:     volume,
-		command.User:       user,
-		command.StopSignal: stopSignal,
-		command.Arg:        arg,
+		command.Env:         env,
+		command.Label:       label,
+		command.Maintainer:  maintainer,
+		command.Add:         add,
+		command.Copy:        dispatchCopy, // copy() is a go builtin
+		command.From:        from,
+		command.Onbuild:     onbuild,
+		command.Workdir:     workdir,
+		command.Run:         run,
+		command.Cmd:         cmd,
+		command.Entrypoint:  entrypoint,
+		command.Expose:      expose,
+		command.Volume:      volume,
+		command.User:        user,
+		command.StopSignal:  stopSignal,
+		command.Arg:         arg,
+		command.Healthcheck: healthcheck,
 	}
 	}
 }
 }
 
 

+ 29 - 0
builder/dockerfile/parser/line_parsers.go

@@ -329,3 +329,32 @@ func parseMaybeJSONToList(rest string) (*Node, map[string]bool, error) {
 
 
 	return parseStringsWhitespaceDelimited(rest)
 	return parseStringsWhitespaceDelimited(rest)
 }
 }
+
+// The HEALTHCHECK command is like parseMaybeJSON, but has an extra type argument.
+func parseHealthConfig(rest string) (*Node, map[string]bool, error) {
+	// Find end of first argument
+	var sep int
+	for ; sep < len(rest); sep++ {
+		if unicode.IsSpace(rune(rest[sep])) {
+			break
+		}
+	}
+	next := sep
+	for ; next < len(rest); next++ {
+		if !unicode.IsSpace(rune(rest[next])) {
+			break
+		}
+	}
+
+	if sep == 0 {
+		return nil, nil, nil
+	}
+
+	typ := rest[:sep]
+	cmd, attrs, err := parseMaybeJSON(rest[next:])
+	if err != nil {
+		return nil, nil, err
+	}
+
+	return &Node{Value: typ, Next: cmd, Attributes: attrs}, nil, err
+}

+ 17 - 16
builder/dockerfile/parser/parser.go

@@ -66,22 +66,23 @@ func init() {
 	// functions. Errors are propagated up by Parse() and the resulting AST can
 	// functions. Errors are propagated up by Parse() and the resulting AST can
 	// be incorporated directly into the existing AST as a next.
 	// be incorporated directly into the existing AST as a next.
 	dispatch = map[string]func(string) (*Node, map[string]bool, error){
 	dispatch = map[string]func(string) (*Node, map[string]bool, error){
-		command.User:       parseString,
-		command.Onbuild:    parseSubCommand,
-		command.Workdir:    parseString,
-		command.Env:        parseEnv,
-		command.Label:      parseLabel,
-		command.Maintainer: parseString,
-		command.From:       parseString,
-		command.Add:        parseMaybeJSONToList,
-		command.Copy:       parseMaybeJSONToList,
-		command.Run:        parseMaybeJSON,
-		command.Cmd:        parseMaybeJSON,
-		command.Entrypoint: parseMaybeJSON,
-		command.Expose:     parseStringsWhitespaceDelimited,
-		command.Volume:     parseMaybeJSONToList,
-		command.StopSignal: parseString,
-		command.Arg:        parseNameOrNameVal,
+		command.User:        parseString,
+		command.Onbuild:     parseSubCommand,
+		command.Workdir:     parseString,
+		command.Env:         parseEnv,
+		command.Label:       parseLabel,
+		command.Maintainer:  parseString,
+		command.From:        parseString,
+		command.Add:         parseMaybeJSONToList,
+		command.Copy:        parseMaybeJSONToList,
+		command.Run:         parseMaybeJSON,
+		command.Cmd:         parseMaybeJSON,
+		command.Entrypoint:  parseMaybeJSON,
+		command.Expose:      parseStringsWhitespaceDelimited,
+		command.Volume:      parseMaybeJSONToList,
+		command.StopSignal:  parseString,
+		command.Arg:         parseNameOrNameVal,
+		command.Healthcheck: parseHealthConfig,
 	}
 	}
 }
 }
 
 

+ 10 - 0
builder/dockerfile/parser/testfiles/health/Dockerfile

@@ -0,0 +1,10 @@
+FROM debian
+ADD check.sh main.sh /app/
+CMD /app/main.sh
+HEALTHCHECK
+HEALTHCHECK --interval=5s --timeout=3s --retries=1 \
+  CMD /app/check.sh --quiet
+HEALTHCHECK CMD
+HEALTHCHECK   CMD   a b
+HEALTHCHECK --timeout=3s CMD ["foo"]
+HEALTHCHECK CONNECT TCP 7000

+ 9 - 0
builder/dockerfile/parser/testfiles/health/result

@@ -0,0 +1,9 @@
+(from "debian")
+(add "check.sh" "main.sh" "/app/")
+(cmd "/app/main.sh")
+(healthcheck)
+(healthcheck ["--interval=5s" "--timeout=3s" "--retries=1"] "CMD" "/app/check.sh --quiet")
+(healthcheck "CMD")
+(healthcheck "CMD" "a b")
+(healthcheck ["--timeout=3s"] "CMD" "foo")
+(healthcheck "CONNECT" "TCP 7000")

+ 49 - 0
container/health.go

@@ -0,0 +1,49 @@
+package container
+
+import (
+	"github.com/Sirupsen/logrus"
+	"github.com/docker/engine-api/types"
+)
+
+// Health holds the current container health-check state
+type Health struct {
+	types.Health
+	stop chan struct{} // Write struct{} to stop the monitor
+}
+
+// String returns a human-readable description of the health-check state
+func (s *Health) String() string {
+	if s.stop == nil {
+		return "no healthcheck"
+	}
+	switch s.Status {
+	case types.Starting:
+		return "health: starting"
+	default: // Healthy and Unhealthy are clear on their own
+		return s.Status
+	}
+}
+
+// OpenMonitorChannel creates and returns a new monitor channel. If there already is one,
+// it returns nil.
+func (s *Health) OpenMonitorChannel() chan struct{} {
+	if s.stop == nil {
+		logrus.Debugf("OpenMonitorChannel")
+		s.stop = make(chan struct{})
+		return s.stop
+	}
+	return nil
+}
+
+// CloseMonitorChannel closes any existing monitor channel.
+func (s *Health) CloseMonitorChannel() {
+	if s.stop != nil {
+		logrus.Debugf("CloseMonitorChannel: waiting for probe to stop")
+		// This channel does not buffer. Once the write succeeds, the monitor
+		// has read the stop request and will not make any further updates
+		// to c.State.Health.
+		s.stop <- struct{}{}
+		s.stop = nil
+		logrus.Debugf("CloseMonitorChannel done")
+	}
+}

+ 4 - 0
container/state.go

@@ -27,6 +27,7 @@ type State struct {
 	StartedAt         time.Time
 	StartedAt         time.Time
 	FinishedAt        time.Time
 	FinishedAt        time.Time
 	waitChan          chan struct{}
 	waitChan          chan struct{}
+	Health            *Health
 }
 }
 
 
 // NewState creates a default state object with a fresh channel for state changes.
 // NewState creates a default state object with a fresh channel for state changes.
@@ -46,6 +47,9 @@ func (s *State) String() string {
 			return fmt.Sprintf("Restarting (%d) %s ago", s.ExitCode, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt)))
 			return fmt.Sprintf("Restarting (%d) %s ago", s.ExitCode, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt)))
 		}
 		}
 
 
+		if h := s.Health; h != nil {
+			return fmt.Sprintf("Up %s (%s)", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)), h.String())
+		}
 		return fmt.Sprintf("Up %s", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)))
 		return fmt.Sprintf("Up %s", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)))
 	}
 	}
 
 

+ 19 - 0
daemon/commit.go

@@ -80,6 +80,25 @@ func merge(userConf, imageConf *containertypes.Config) error {
 			userConf.Entrypoint = imageConf.Entrypoint
 			userConf.Entrypoint = imageConf.Entrypoint
 		}
 		}
 	}
 	}
+	if imageConf.Healthcheck != nil {
+		if userConf.Healthcheck == nil {
+			userConf.Healthcheck = imageConf.Healthcheck
+		} else {
+			if len(userConf.Healthcheck.Test) == 0 {
+				userConf.Healthcheck.Test = imageConf.Healthcheck.Test
+			}
+			if userConf.Healthcheck.Interval == 0 {
+				userConf.Healthcheck.Interval = imageConf.Healthcheck.Interval
+			}
+			if userConf.Healthcheck.Timeout == 0 {
+				userConf.Healthcheck.Timeout = imageConf.Healthcheck.Timeout
+			}
+			if userConf.Healthcheck.Retries == 0 {
+				userConf.Healthcheck.Retries = imageConf.Healthcheck.Retries
+			}
+		}
+	}
+
 	if userConf.WorkingDir == "" {
 	if userConf.WorkingDir == "" {
 		userConf.WorkingDir = imageConf.WorkingDir
 		userConf.WorkingDir = imageConf.WorkingDir
 	}
 	}

+ 23 - 5
daemon/exec.go

@@ -14,11 +14,15 @@ import (
 	"github.com/docker/docker/errors"
 	"github.com/docker/docker/errors"
 	"github.com/docker/docker/libcontainerd"
 	"github.com/docker/docker/libcontainerd"
 	"github.com/docker/docker/pkg/pools"
 	"github.com/docker/docker/pkg/pools"
+	"github.com/docker/docker/pkg/signal"
 	"github.com/docker/docker/pkg/term"
 	"github.com/docker/docker/pkg/term"
 	"github.com/docker/engine-api/types"
 	"github.com/docker/engine-api/types"
 	"github.com/docker/engine-api/types/strslice"
 	"github.com/docker/engine-api/types/strslice"
 )
 )
 
 
+// Seconds to wait after sending TERM before trying KILL
+const termProcessTimeout = 10
+
 func (d *Daemon) registerExecCommand(container *container.Container, config *exec.Config) {
 func (d *Daemon) registerExecCommand(container *container.Container, config *exec.Config) {
 	// Storing execs in container in order to kill them gracefully whenever the container is stopped or removed.
 	// Storing execs in container in order to kill them gracefully whenever the container is stopped or removed.
 	container.ExecCommands.Add(config.ID, config)
 	container.ExecCommands.Add(config.ID, config)
@@ -130,7 +134,8 @@ func (d *Daemon) ContainerExecCreate(name string, config *types.ExecConfig) (str
 
 
 // ContainerExecStart starts a previously set up exec instance. The
 // ContainerExecStart starts a previously set up exec instance. The
 // std streams are set up.
 // std streams are set up.
-func (d *Daemon) ContainerExecStart(name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) (err error) {
+// If ctx is cancelled, the process is terminated.
+func (d *Daemon) ContainerExecStart(ctx context.Context, name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) (err error) {
 	var (
 	var (
 		cStdin           io.ReadCloser
 		cStdin           io.ReadCloser
 		cStdout, cStderr io.Writer
 		cStdout, cStderr io.Writer
@@ -197,15 +202,28 @@ func (d *Daemon) ContainerExecStart(name string, stdin io.ReadCloser, stdout io.
 		return nil
 		return nil
 	}
 	}
 
 
-	attachErr := container.AttachStreams(context.Background(), ec.StreamConfig, ec.OpenStdin, true, ec.Tty, cStdin, cStdout, cStderr, ec.DetachKeys)
+	attachErr := container.AttachStreams(ctx, ec.StreamConfig, ec.OpenStdin, true, ec.Tty, cStdin, cStdout, cStderr, ec.DetachKeys)
 
 
 	if err := d.containerd.AddProcess(c.ID, name, p); err != nil {
 	if err := d.containerd.AddProcess(c.ID, name, p); err != nil {
 		return err
 		return err
 	}
 	}
 
 
-	err = <-attachErr
-	if err != nil {
-		return fmt.Errorf("attach failed with error: %v", err)
+	select {
+	case <-ctx.Done():
+		logrus.Debugf("Sending TERM signal to process %v in container %v", name, c.ID)
+		d.containerd.SignalProcess(c.ID, name, int(signal.SignalMap["TERM"]))
+		select {
+		case <-time.After(termProcessTimeout * time.Second):
+			logrus.Infof("Container %v, process %v failed to exit within %d seconds of signal TERM - using the force", c.ID, name, termProcessTimeout)
+			d.containerd.SignalProcess(c.ID, name, int(signal.SignalMap["KILL"]))
+		case <-attachErr:
+			// TERM signal worked
+		}
+		return fmt.Errorf("context cancelled")
+	case err := <-attachErr:
+		if err != nil {
+			return fmt.Errorf("attach failed with error: %v", err)
+		}
 	}
 	}
 	return nil
 	return nil
 }
 }

+ 314 - 0
daemon/health.go

@@ -0,0 +1,314 @@
+package daemon
+
+import (
+	"bytes"
+	"fmt"
+	"runtime"
+	"strings"
+	"time"
+
+	"golang.org/x/net/context"
+
+	"github.com/Sirupsen/logrus"
+	"github.com/docker/docker/container"
+	"github.com/docker/docker/daemon/exec"
+	"github.com/docker/engine-api/types"
+	"github.com/docker/engine-api/types/strslice"
+)
+
+const (
+	// Longest healthcheck probe output message to store. Longer messages will be truncated.
+	maxOutputLen = 4096
+
+	// Default interval between probe runs (from the end of the first to the start of the second).
+	// Also the time before the first probe.
+	defaultProbeInterval = 30 * time.Second
+
+	// The maximum length of time a single probe run should take. If the probe takes longer
+	// than this, the check is considered to have failed.
+	defaultProbeTimeout = 30 * time.Second
+
+	// Shut down a container if it becomes Unhealthy.
+	defaultExitOnUnhealthy = true
+
+	// Maximum number of entries to record
+	maxLogEntries = 5
+)
+
+const (
+	// Exit status codes that can be returned by the probe command.
+
+	exitStatusHealthy   = 0 // Container is healthy
+	exitStatusUnhealthy = 1 // Container is unhealthy
+	exitStatusStarting  = 2 // Container needs more time to start
+)
+
+// probe implementations know how to run a particular type of probe.
+type probe interface {
+	// Perform one run of the check. Returns the exit code and an optional
+	// short diagnostic string.
+	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
+}
+
+// cmdProbe implements the "CMD" probe type.
+type cmdProbe struct {
+	// Run the command with the system's default shell instead of execing it directly.
+	shell bool
+}
+
+// exec the healthcheck command in the container.
+// Returns the exit code and probe output (if any)
+func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {
+	cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:]
+	if p.shell {
+		if runtime.GOOS != "windows" {
+			cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...)
+		} else {
+			cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...)
+		}
+	}
+	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
+	execConfig := exec.NewConfig()
+	execConfig.OpenStdin = false
+	execConfig.OpenStdout = true
+	execConfig.OpenStderr = true
+	execConfig.ContainerID = container.ID
+	execConfig.DetachKeys = []byte{}
+	execConfig.Entrypoint = entrypoint
+	execConfig.Args = args
+	execConfig.Tty = false
+	execConfig.Privileged = false
+	execConfig.User = container.Config.User
+
+	d.registerExecCommand(container, execConfig)
+	d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))
+
+	output := &limitedBuffer{}
+	err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
+	if err != nil {
+		return nil, err
+	}
+	info, err := d.getExecConfig(execConfig.ID)
+	if err != nil {
+		return nil, err
+	}
+	if info.ExitCode == nil {
+		return nil, fmt.Errorf("Healthcheck has no exit code!")
+	}
+	// Note: Go's json package will handle invalid UTF-8 for us
+	out := output.String()
+	return &types.HealthcheckResult{
+		End:      time.Now(),
+		ExitCode: *info.ExitCode,
+		Output:   out,
+	}, nil
+}
+
+// Update the container's Status.Health struct based on the latest probe's result.
+func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) {
+	c.Lock()
+	defer c.Unlock()
+
+	retries := c.Config.Healthcheck.Retries
+	if retries <= 0 {
+		retries = 1 // Default if unset or set to an invalid value
+	}
+
+	h := c.State.Health
+	oldStatus := h.Status
+
+	if len(h.Log) >= maxLogEntries {
+		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
+	} else {
+		h.Log = append(h.Log, result)
+	}
+
+	if result.ExitCode == exitStatusHealthy {
+		h.FailingStreak = 0
+		h.Status = types.Healthy
+	} else if result.ExitCode == exitStatusStarting && c.State.Health.Status == types.Starting {
+		// The container is not ready yet. Remain in the starting state.
+	} else {
+		// Failure (incuding invalid exit code)
+		h.FailingStreak++
+		if c.State.Health.FailingStreak >= retries {
+			h.Status = types.Unhealthy
+		}
+		// Else we're starting or healthy. Stay in that state.
+	}
+
+	if oldStatus != h.Status {
+		d.LogContainerEvent(c, "health_status: "+h.Status)
+	}
+}
+
+// Run the container's monitoring thread until notified via "stop".
+// There is never more than one monitor thread running per container at a time.
+func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
+	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
+	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
+	for {
+		select {
+		case <-stop:
+			logrus.Debugf("Stop healthcheck monitoring (received while idle)")
+			return
+		case <-time.After(probeInterval):
+			logrus.Debugf("Running health check...")
+			startTime := time.Now()
+			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
+			results := make(chan *types.HealthcheckResult)
+			go func() {
+				result, err := probe.run(ctx, d, c)
+				if err != nil {
+					logrus.Warnf("Health check error: %v", err)
+					results <- &types.HealthcheckResult{
+						ExitCode: -1,
+						Output:   err.Error(),
+						Start:    startTime,
+						End:      time.Now(),
+					}
+				} else {
+					result.Start = startTime
+					logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode)
+					results <- result
+				}
+				close(results)
+			}()
+			select {
+			case <-stop:
+				logrus.Debugf("Stop healthcheck monitoring (received while probing)")
+				// Stop timeout and kill probe, but don't wait for probe to exit.
+				cancelProbe()
+				return
+			case result := <-results:
+				handleProbeResult(d, c, result)
+				// Stop timeout
+				cancelProbe()
+			case <-ctx.Done():
+				logrus.Debugf("Health check taking too long")
+				handleProbeResult(d, c, &types.HealthcheckResult{
+					ExitCode: -1,
+					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
+					Start:    startTime,
+					End:      time.Now(),
+				})
+				cancelProbe()
+				// Wait for probe to exit (it might take a while to respond to the TERM
+				// signal and we don't want dying probes to pile up).
+				<-results
+			}
+		}
+	}
+}
+
+// Get a suitable probe implementation for the container's healthcheck configuration.
+func getProbe(c *container.Container) probe {
+	config := c.Config.Healthcheck
+	if config == nil || len(config.Test) == 0 {
+		return nil
+	}
+	switch config.Test[0] {
+	case "CMD":
+		return &cmdProbe{shell: false}
+	case "CMD-SHELL":
+		return &cmdProbe{shell: true}
+	default:
+		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0])
+		return nil
+	}
+}
+
+// Ensure the health-check monitor is running or not, depending on the current
+// state of the container.
+// Called from monitor.go, with c locked.
+func (d *Daemon) updateHealthMonitor(c *container.Container) {
+	h := c.State.Health
+	if h == nil {
+		return // No healthcheck configured
+	}
+
+	probe := getProbe(c)
+	wantRunning := c.Running && !c.Paused && probe != nil
+	if wantRunning {
+		if stop := h.OpenMonitorChannel(); stop != nil {
+			go monitor(d, c, stop, probe)
+		}
+	} else {
+		h.CloseMonitorChannel()
+	}
+}
+
+// Reset the health state for a newly-started, restarted or restored container.
+// initHealthMonitor is called from monitor.go and we should never be running
+// two instances at once.
+// Called with c locked.
+func (d *Daemon) initHealthMonitor(c *container.Container) {
+	if c.Config.Healthcheck == nil {
+		return
+	}
+
+	// This is needed in case we're auto-restarting
+	d.stopHealthchecks(c)
+
+	if c.State.Health == nil {
+		h := &container.Health{}
+		h.Status = types.Starting
+		h.FailingStreak = 0
+		c.State.Health = h
+	}
+
+	d.updateHealthMonitor(c)
+}
+
+// Called when the container is being stopped (whether because the health check is
+// failing or for any other reason).
+func (d *Daemon) stopHealthchecks(c *container.Container) {
+	h := c.State.Health
+	if h != nil {
+		h.CloseMonitorChannel()
+	}
+}
+
+// Buffer up to maxOutputLen bytes. Further data is discarded.
+type limitedBuffer struct {
+	buf       bytes.Buffer
+	truncated bool // indicates that data has been lost
+}
+
+// Append to limitedBuffer while there is room.
+func (b *limitedBuffer) Write(data []byte) (int, error) {
+	bufLen := b.buf.Len()
+	dataLen := len(data)
+	keep := min(maxOutputLen-bufLen, dataLen)
+	if keep > 0 {
+		b.buf.Write(data[:keep])
+	}
+	if keep < dataLen {
+		b.truncated = true
+	}
+	return dataLen, nil
+}
+
+// The contents of the buffer, with "..." appended if it overflowed.
+func (b *limitedBuffer) String() string {
+	out := b.buf.String()
+	if b.truncated {
+		out = out + "..."
+	}
+	return out
+}
+
+// If configuredValue is zero, use defaultValue instead.
+func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
+	if configuredValue == 0 {
+		return defaultValue
+	}
+	return configuredValue
+}
+
+func min(x, y int) int {
+	if x < y {
+		return x
+	}
+	return y
+}

+ 112 - 0
daemon/health_test.go

@@ -0,0 +1,112 @@
+package daemon
+
+import (
+	"testing"
+	"time"
+
+	"github.com/docker/docker/container"
+	"github.com/docker/docker/daemon/events"
+	"github.com/docker/engine-api/types"
+	containertypes "github.com/docker/engine-api/types/container"
+	eventtypes "github.com/docker/engine-api/types/events"
+)
+
+func reset(c *container.Container) {
+	c.State = &container.State{}
+	c.State.Health = &container.Health{}
+	c.State.Health.Status = types.Starting
+}
+
+func TestHealthStates(t *testing.T) {
+	e := events.New()
+	_, l, _ := e.Subscribe()
+	defer e.Evict(l)
+
+	expect := func(expected string) {
+		select {
+		case event := <-l:
+			ev := event.(eventtypes.Message)
+			if ev.Status != expected {
+				t.Errorf("Expecting event %#v, but got %#v\n", expected, ev.Status)
+			}
+		case <-time.After(1 * time.Second):
+			t.Errorf("Expecting event %#v, but got nothing\n", expected)
+		}
+	}
+
+	c := &container.Container{
+		CommonContainer: container.CommonContainer{
+			ID:   "container_id",
+			Name: "container_name",
+			Config: &containertypes.Config{
+				Image: "image_name",
+			},
+		},
+	}
+	daemon := &Daemon{
+		EventsService: e,
+	}
+
+	c.Config.Healthcheck = &containertypes.HealthConfig{
+		Retries: 1,
+	}
+
+	reset(c)
+
+	handleResult := func(startTime time.Time, exitCode int) {
+		handleProbeResult(daemon, c, &types.HealthcheckResult{
+			Start:    startTime,
+			End:      startTime,
+			ExitCode: exitCode,
+		})
+	}
+
+	// starting -> failed -> success -> failed
+
+	handleResult(c.State.StartedAt.Add(1*time.Second), 1)
+	expect("health_status: unhealthy")
+
+	handleResult(c.State.StartedAt.Add(2*time.Second), 0)
+	expect("health_status: healthy")
+
+	handleResult(c.State.StartedAt.Add(3*time.Second), 1)
+	expect("health_status: unhealthy")
+
+	// starting -> starting -> starting ->
+	// healthy -> starting (invalid transition)
+
+	reset(c)
+
+	handleResult(c.State.StartedAt.Add(20*time.Second), 2)
+	handleResult(c.State.StartedAt.Add(40*time.Second), 2)
+	if c.State.Health.Status != types.Starting {
+		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
+	}
+
+	handleResult(c.State.StartedAt.Add(50*time.Second), 0)
+	expect("health_status: healthy")
+	handleResult(c.State.StartedAt.Add(60*time.Second), 2)
+	expect("health_status: unhealthy")
+
+	// Test retries
+
+	reset(c)
+	c.Config.Healthcheck.Retries = 3
+
+	handleResult(c.State.StartedAt.Add(20*time.Second), 1)
+	handleResult(c.State.StartedAt.Add(40*time.Second), 1)
+	if c.State.Health.Status != types.Starting {
+		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
+	}
+	if c.State.Health.FailingStreak != 2 {
+		t.Errorf("Expecting FailingStreak=2, but got %d\n", c.State.Health.FailingStreak)
+	}
+	handleResult(c.State.StartedAt.Add(60*time.Second), 1)
+	expect("health_status: unhealthy")
+
+	handleResult(c.State.StartedAt.Add(80*time.Second), 0)
+	expect("health_status: healthy")
+	if c.State.Health.FailingStreak != 0 {
+		t.Errorf("Expecting FailingStreak=0, but got %d\n", c.State.Health.FailingStreak)
+	}
+}

+ 10 - 0
daemon/inspect.go

@@ -108,6 +108,15 @@ func (daemon *Daemon) getInspectData(container *container.Container, size bool)
 		hostConfig.Links = append(hostConfig.Links, fmt.Sprintf("%s:%s", child.Name, linkAlias))
 		hostConfig.Links = append(hostConfig.Links, fmt.Sprintf("%s:%s", child.Name, linkAlias))
 	}
 	}
 
 
+	var containerHealth *types.Health
+	if container.State.Health != nil {
+		containerHealth = &types.Health{
+			Status:        container.State.Health.Status,
+			FailingStreak: container.State.Health.FailingStreak,
+			Log:           append([]*types.HealthcheckResult{}, container.State.Health.Log...),
+		}
+	}
+
 	containerState := &types.ContainerState{
 	containerState := &types.ContainerState{
 		Status:     container.State.StateString(),
 		Status:     container.State.StateString(),
 		Running:    container.State.Running,
 		Running:    container.State.Running,
@@ -120,6 +129,7 @@ func (daemon *Daemon) getInspectData(container *container.Container, size bool)
 		Error:      container.State.Error,
 		Error:      container.State.Error,
 		StartedAt:  container.State.StartedAt.Format(time.RFC3339Nano),
 		StartedAt:  container.State.StartedAt.Format(time.RFC3339Nano),
 		FinishedAt: container.State.FinishedAt.Format(time.RFC3339Nano),
 		FinishedAt: container.State.FinishedAt.Format(time.RFC3339Nano),
+		Health:     containerHealth,
 	}
 	}
 
 
 	contJSONBase := &types.ContainerJSONBase{
 	contJSONBase := &types.ContainerJSONBase{

+ 9 - 0
daemon/monitor.go

@@ -25,6 +25,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 		if runtime.GOOS == "windows" {
 		if runtime.GOOS == "windows" {
 			return errors.New("Received StateOOM from libcontainerd on Windows. This should never happen.")
 			return errors.New("Received StateOOM from libcontainerd on Windows. This should never happen.")
 		}
 		}
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEvent(c, "oom")
 		daemon.LogContainerEvent(c, "oom")
 	case libcontainerd.StateExit:
 	case libcontainerd.StateExit:
 		c.Lock()
 		c.Lock()
@@ -35,6 +36,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 		attributes := map[string]string{
 		attributes := map[string]string{
 			"exitCode": strconv.Itoa(int(e.ExitCode)),
 			"exitCode": strconv.Itoa(int(e.ExitCode)),
 		}
 		}
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEventWithAttributes(c, "die", attributes)
 		daemon.LogContainerEventWithAttributes(c, "die", attributes)
 		daemon.Cleanup(c)
 		daemon.Cleanup(c)
 		// FIXME: here is race condition between two RUN instructions in Dockerfile
 		// FIXME: here is race condition between two RUN instructions in Dockerfile
@@ -54,6 +56,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 			"exitCode": strconv.Itoa(int(e.ExitCode)),
 			"exitCode": strconv.Itoa(int(e.ExitCode)),
 		}
 		}
 		daemon.LogContainerEventWithAttributes(c, "die", attributes)
 		daemon.LogContainerEventWithAttributes(c, "die", attributes)
+		daemon.updateHealthMonitor(c)
 		return c.ToDisk()
 		return c.ToDisk()
 	case libcontainerd.StateExitProcess:
 	case libcontainerd.StateExitProcess:
 		c.Lock()
 		c.Lock()
@@ -74,18 +77,24 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
 			logrus.Warnf("Ignoring StateExitProcess for %v but no exec command found", e)
 			logrus.Warnf("Ignoring StateExitProcess for %v but no exec command found", e)
 		}
 		}
 	case libcontainerd.StateStart, libcontainerd.StateRestore:
 	case libcontainerd.StateStart, libcontainerd.StateRestore:
+		// Container is already locked in this case
 		c.SetRunning(int(e.Pid), e.State == libcontainerd.StateStart)
 		c.SetRunning(int(e.Pid), e.State == libcontainerd.StateStart)
 		c.HasBeenManuallyStopped = false
 		c.HasBeenManuallyStopped = false
 		if err := c.ToDisk(); err != nil {
 		if err := c.ToDisk(); err != nil {
 			c.Reset(false)
 			c.Reset(false)
 			return err
 			return err
 		}
 		}
+		daemon.initHealthMonitor(c)
 		daemon.LogContainerEvent(c, "start")
 		daemon.LogContainerEvent(c, "start")
 	case libcontainerd.StatePause:
 	case libcontainerd.StatePause:
+		// Container is already locked in this case
 		c.Paused = true
 		c.Paused = true
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEvent(c, "pause")
 		daemon.LogContainerEvent(c, "pause")
 	case libcontainerd.StateResume:
 	case libcontainerd.StateResume:
+		// Container is already locked in this case
 		c.Paused = false
 		c.Paused = false
+		daemon.updateHealthMonitor(c)
 		daemon.LogContainerEvent(c, "unpause")
 		daemon.LogContainerEvent(c, "unpause")
 	}
 	}
 
 

+ 2 - 0
daemon/stop.go

@@ -41,6 +41,8 @@ func (daemon *Daemon) containerStop(container *container.Container, seconds int)
 		return nil
 		return nil
 	}
 	}
 
 
+	daemon.stopHealthchecks(container)
+
 	stopSignal := container.StopSignal()
 	stopSignal := container.StopSignal()
 	// 1. Send a stop signal
 	// 1. Send a stop signal
 	if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {
 	if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {

+ 67 - 0
docs/reference/builder.md

@@ -1470,6 +1470,73 @@ The `STOPSIGNAL` instruction sets the system call signal that will be sent to th
 This signal can be a valid unsigned number that matches a position in the kernel's syscall table, for instance 9,
 This signal can be a valid unsigned number that matches a position in the kernel's syscall table, for instance 9,
 or a signal name in the format SIGNAME, for instance SIGKILL.
 or a signal name in the format SIGNAME, for instance SIGKILL.
 
 
+## HEALTHCHECK
+
+The `HEALTHCHECK` instruction has two forms:
+
+* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container)
+* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image)
+
+The `HEALTHCHECK` instruction tells Docker how to test a container to check that
+it is still working. This can detect cases such as a web server that is stuck in
+an infinite loop and unable to handle new connections, even though the server
+process is still running.
+
+When a container has a healthcheck specified, it has a _health status_ in
+addition to its normal status. This status is initially `starting`. Whenever a
+health check passes, it becomes `healthy` (whatever state it was previously in).
+After a certain number of consecutive failures, it becomes `unhealthy`.
+
+The options that can appear before `CMD` are:
+
+* `--interval=DURATION` (default: `30s`)
+* `--timeout=DURATION` (default: `30s`)
+* `--retries=N` (default: `1`)
+
+The health check will first run **interval** seconds after the container is
+started, and then again **interval** seconds after each previous check completes.
+
+If a single run of the check takes longer than **timeout** seconds then the check
+is considered to have failed.
+
+It takes **retries** consecutive failures of the health check for the container
+to be considered `unhealthy`.
+
+There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list
+more than one then only the last `HEALTHCHECK` will take effect.
+
+The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK
+CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands;
+see e.g. `ENTRYPOINT` for details).
+
+The command's exit status indicates the health status of the container.
+The possible values are:
+
+- 0: success - the container is healthy and ready for use
+- 1: unhealthy - the container is not working correctly
+- 2: starting - the container is not ready for use yet, but is working correctly
+
+If the probe returns 2 ("starting") when the container has already moved out of the
+"starting" state then it is treated as "unhealthy" instead.
+
+For example, to check every five minutes or so that a web-server is able to
+serve the site's main page within three seconds:
+
+    HEALTHCHECK --interval=5m --timeout=3s \
+      CMD curl -f http://localhost/ || exit 1
+
+To help debug failing probes, any output text (UTF-8 encoded) that the command writes
+to stdout or stderr will be stored in the health status and can be queried with
+`docker inspect`. Such output should be kept short (currently only the first 4096 bytes
+are stored).
+
+When the health status of a container changes, a `health_status` event is
+generated with the new status.
+
+The `HEALTHCHECK` feature was added in Docker 1.12.
+
+
+
 ## Dockerfile examples
 ## Dockerfile examples
 
 
 Below you can see some examples of Dockerfile syntax. If you're interested in
 Below you can see some examples of Dockerfile syntax. If you're interested in

+ 60 - 0
docs/reference/run.md

@@ -1250,6 +1250,7 @@ Dockerfile instruction and how the operator can override that setting.
     #entrypoint-default-command-to-execute-at-runtime)
     #entrypoint-default-command-to-execute-at-runtime)
  - [EXPOSE (Incoming Ports)](#expose-incoming-ports)
  - [EXPOSE (Incoming Ports)](#expose-incoming-ports)
  - [ENV (Environment Variables)](#env-environment-variables)
  - [ENV (Environment Variables)](#env-environment-variables)
+ - [HEALTHCHECK](#healthcheck)
  - [VOLUME (Shared Filesystems)](#volume-shared-filesystems)
  - [VOLUME (Shared Filesystems)](#volume-shared-filesystems)
  - [USER](#user)
  - [USER](#user)
  - [WORKDIR](#workdir)
  - [WORKDIR](#workdir)
@@ -1398,6 +1399,65 @@ above, or already defined by the developer with a Dockerfile `ENV`:
 
 
 Similarly the operator can set the **hostname** with `-h`.
 Similarly the operator can set the **hostname** with `-h`.
 
 
+### HEALTHCHECK
+
+```
+  --health-cmd            Command to run to check health
+  --health-interval       Time between running the check
+  --health-retries        Consecutive failures needed to report unhealthy
+  --health-timeout        Maximum time to allow one check to run
+  --no-healthcheck        Disable any container-specified HEALTHCHECK
+```
+
+Example:
+
+    $ docker run --name=test -d \
+        --health-cmd='stat /etc/passwd || exit 1' \
+        --health-interval=2s \
+        busybox sleep 1d
+    $ sleep 2; docker inspect --format='{{.State.Health.Status}}' test
+    healthy
+    $ docker exec test rm /etc/passwd
+    $ sleep 2; docker inspect --format='{{json .State.Health}}' test
+    {
+      "Status": "unhealthy",
+      "FailingStreak": 3,
+      "Log": [
+        {
+          "Start": "2016-05-25T17:22:04.635478668Z",
+          "End": "2016-05-25T17:22:04.7272552Z",
+          "ExitCode": 0,
+          "Output": "  File: /etc/passwd\n  Size: 334       \tBlocks: 8          IO Block: 4096   regular file\nDevice: 32h/50d\tInode: 12          Links: 1\nAccess: (0664/-rw-rw-r--)  Uid: (    0/    root)   Gid: (    0/    root)\nAccess: 2015-12-05 22:05:32.000000000\nModify: 2015..."
+        },
+        {
+          "Start": "2016-05-25T17:22:06.732900633Z",
+          "End": "2016-05-25T17:22:06.822168935Z",
+          "ExitCode": 0,
+          "Output": "  File: /etc/passwd\n  Size: 334       \tBlocks: 8          IO Block: 4096   regular file\nDevice: 32h/50d\tInode: 12          Links: 1\nAccess: (0664/-rw-rw-r--)  Uid: (    0/    root)   Gid: (    0/    root)\nAccess: 2015-12-05 22:05:32.000000000\nModify: 2015..."
+        },
+        {
+          "Start": "2016-05-25T17:22:08.823956535Z",
+          "End": "2016-05-25T17:22:08.897359124Z",
+          "ExitCode": 1,
+          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
+        },
+        {
+          "Start": "2016-05-25T17:22:10.898802931Z",
+          "End": "2016-05-25T17:22:10.969631866Z",
+          "ExitCode": 1,
+          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
+        },
+        {
+          "Start": "2016-05-25T17:22:12.971033523Z",
+          "End": "2016-05-25T17:22:13.082015516Z",
+          "ExitCode": 1,
+          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
+        }
+      ]
+    }
+
+The health status is also displayed in the `docker ps` output.
+
 ### TMPFS (mount tmpfs filesystems)
 ### TMPFS (mount tmpfs filesystems)
 
 
 ```bash
 ```bash

+ 1 - 1
hack/vendor.sh

@@ -60,7 +60,7 @@ clone git golang.org/x/net 78cb2c067747f08b343f20614155233ab4ea2ad3 https://gith
 clone git golang.org/x/sys eb2c74142fd19a79b3f237334c7384d5167b1b46 https://github.com/golang/sys.git
 clone git golang.org/x/sys eb2c74142fd19a79b3f237334c7384d5167b1b46 https://github.com/golang/sys.git
 clone git github.com/docker/go-units 651fc226e7441360384da338d0fd37f2440ffbe3
 clone git github.com/docker/go-units 651fc226e7441360384da338d0fd37f2440ffbe3
 clone git github.com/docker/go-connections v0.2.0
 clone git github.com/docker/go-connections v0.2.0
-clone git github.com/docker/engine-api 009ba1641d669613b38818f6f6385b0e74c5728f
+clone git github.com/docker/engine-api fa04f66c7871183dd53a5ec666479f49b452743d
 clone git github.com/RackSec/srslog 259aed10dfa74ea2961eddd1d9847619f6e98837
 clone git github.com/RackSec/srslog 259aed10dfa74ea2961eddd1d9847619f6e98837
 clone git github.com/imdario/mergo 0.2.1
 clone git github.com/imdario/mergo 0.2.1
 
 

+ 154 - 0
integration-cli/docker_cli_health_test.go

@@ -0,0 +1,154 @@
+package main
+
+import (
+	"encoding/json"
+	"github.com/docker/docker/pkg/integration/checker"
+	"github.com/docker/engine-api/types"
+	"github.com/go-check/check"
+	"strconv"
+	"strings"
+	"time"
+)
+
+func waitForStatus(c *check.C, name string, prev string, expected string) {
+	prev = prev + "\n"
+	expected = expected + "\n"
+	for {
+		out, _ := dockerCmd(c, "inspect", "--format={{.State.Status}}", name)
+		if out == expected {
+			return
+		}
+		c.Check(out, checker.Equals, prev)
+		if out != prev {
+			return
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+}
+
+func waitForHealthStatus(c *check.C, name string, prev string, expected string) {
+	prev = prev + "\n"
+	expected = expected + "\n"
+	for {
+		out, _ := dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name)
+		if out == expected {
+			return
+		}
+		c.Check(out, checker.Equals, prev)
+		if out != prev {
+			return
+		}
+		time.Sleep(100 * time.Millisecond)
+	}
+}
+
+func getHealth(c *check.C, name string) *types.Health {
+	out, _ := dockerCmd(c, "inspect", "--format={{json .State.Health}}", name)
+	var health types.Health
+	err := json.Unmarshal([]byte(out), &health)
+	c.Check(err, checker.Equals, nil)
+	return &health
+}
+
+func (s *DockerSuite) TestHealth(c *check.C) {
+	testRequires(c, DaemonIsLinux) // busybox doesn't work on Windows
+
+	imageName := "testhealth"
+	_, err := buildImage(imageName,
+		`FROM busybox
+		RUN echo OK > /status
+		CMD ["/bin/sleep", "120"]
+		STOPSIGNAL SIGKILL
+		HEALTHCHECK --interval=1s --timeout=30s \
+		  CMD cat /status`,
+		true)
+
+	c.Check(err, check.IsNil)
+
+	// No health status before starting
+	name := "test_health"
+	dockerCmd(c, "create", "--name", name, imageName)
+	out, _ := dockerCmd(c, "ps", "-a", "--format={{.Status}}")
+	c.Check(out, checker.Equals, "Created\n")
+
+	// Inspect the options
+	out, _ = dockerCmd(c, "inspect",
+		"--format='timeout={{.Config.Healthcheck.Timeout}} "+
+			"interval={{.Config.Healthcheck.Interval}} "+
+			"retries={{.Config.Healthcheck.Retries}} "+
+			"test={{.Config.Healthcheck.Test}}'", name)
+	c.Check(out, checker.Equals, "timeout=30s interval=1s retries=0 test=[CMD-SHELL cat /status]\n")
+
+	// Start
+	dockerCmd(c, "start", name)
+	waitForHealthStatus(c, name, "starting", "healthy")
+
+	// Make it fail
+	dockerCmd(c, "exec", name, "rm", "/status")
+	waitForHealthStatus(c, name, "healthy", "unhealthy")
+
+	// Inspect the status
+	out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name)
+	c.Check(out, checker.Equals, "unhealthy\n")
+
+	// Make it healthy again
+	dockerCmd(c, "exec", name, "touch", "/status")
+	waitForHealthStatus(c, name, "unhealthy", "healthy")
+
+	// Remove container
+	dockerCmd(c, "rm", "-f", name)
+
+	// Disable the check from the CLI
+	out, _ = dockerCmd(c, "create", "--name=noh", "--no-healthcheck", imageName)
+	out, _ = dockerCmd(c, "inspect", "--format={{.Config.Healthcheck.Test}}", "noh")
+	c.Check(out, checker.Equals, "[NONE]\n")
+	dockerCmd(c, "rm", "noh")
+
+	// Disable the check with a new build
+	_, err = buildImage("no_healthcheck",
+		`FROM testhealth
+		HEALTHCHECK NONE`, true)
+	c.Check(err, check.IsNil)
+
+	out, _ = dockerCmd(c, "inspect", "--format={{.ContainerConfig.Healthcheck.Test}}", "no_healthcheck")
+	c.Check(out, checker.Equals, "[NONE]\n")
+
+	// Enable the checks from the CLI
+	_, _ = dockerCmd(c, "run", "-d", "--name=fatal_healthcheck",
+		"--health-interval=0.5s",
+		"--health-retries=3",
+		"--health-cmd=cat /status",
+		"no_healthcheck")
+	waitForHealthStatus(c, "fatal_healthcheck", "starting", "healthy")
+	health := getHealth(c, "fatal_healthcheck")
+	c.Check(health.Status, checker.Equals, "healthy")
+	c.Check(health.FailingStreak, checker.Equals, 0)
+	last := health.Log[len(health.Log)-1]
+	c.Check(last.ExitCode, checker.Equals, 0)
+	c.Check(last.Output, checker.Equals, "OK\n")
+
+	// Fail the check, which should now make it exit
+	dockerCmd(c, "exec", "fatal_healthcheck", "rm", "/status")
+	waitForStatus(c, "fatal_healthcheck", "running", "exited")
+
+	out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", "fatal_healthcheck")
+	c.Check(out, checker.Equals, "unhealthy\n")
+	failsStr, _ := dockerCmd(c, "inspect", "--format={{.State.Health.FailingStreak}}", "fatal_healthcheck")
+	fails, err := strconv.Atoi(strings.TrimSpace(failsStr))
+	c.Check(err, check.IsNil)
+	c.Check(fails >= 3, checker.Equals, true)
+	dockerCmd(c, "rm", "-f", "fatal_healthcheck")
+
+	// Check timeout
+	// Note: if the interval is too small, Docker seems to spend all its time running health
+	// checks and never gets around to killing the container.
+	_, _ = dockerCmd(c, "run", "-d", "--name=test",
+		"--health-interval=1s", "--health-cmd=sleep 5m", "--health-timeout=1ms", imageName)
+	waitForHealthStatus(c, "test", "starting", "unhealthy")
+	health = getHealth(c, "test")
+	last = health.Log[len(health.Log)-1]
+	c.Check(health.Status, checker.Equals, "unhealthy")
+	c.Check(last.ExitCode, checker.Equals, -1)
+	c.Check(last.Output, checker.Equals, "Health check exceeded timeout (1ms)")
+	dockerCmd(c, "rm", "-f", "test")
+}

+ 11 - 0
libcontainerd/client_linux.go

@@ -190,6 +190,17 @@ func (clnt *client) Signal(containerID string, sig int) error {
 	return err
 	return err
 }
 }
 
 
+func (clnt *client) SignalProcess(containerID string, pid string, sig int) error {
+	clnt.lock(containerID)
+	defer clnt.unlock(containerID)
+	_, err := clnt.remote.apiClient.Signal(context.Background(), &containerd.SignalRequest{
+		Id:     containerID,
+		Pid:    pid,
+		Signal: uint32(sig),
+	})
+	return err
+}
+
 func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error {
 func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error {
 	clnt.lock(containerID)
 	clnt.lock(containerID)
 	defer clnt.unlock(containerID)
 	defer clnt.unlock(containerID)

+ 19 - 0
libcontainerd/client_windows.go

@@ -304,6 +304,25 @@ func (clnt *client) Signal(containerID string, sig int) error {
 	return nil
 	return nil
 }
 }
 
 
+// While Linux has support for the full range of signals, signals aren't really implemented on Windows.
+// We try to terminate the specified process regardless of which signal is requested.
+func (clnt *client) SignalProcess(containerID string, processFriendlyName string, sig int) error {
+	clnt.lock(containerID)
+	defer clnt.unlock(containerID)
+	cont, err := clnt.getContainer(containerID)
+	if err != nil {
+		return err
+	}
+
+	for _, p := range cont.processes {
+		if p.friendlyName == processFriendlyName {
+			return hcsshim.TerminateProcessInComputeSystem(containerID, p.systemPid)
+		}
+	}
+
+	return fmt.Errorf("SignalProcess could not find process %s in %s", processFriendlyName, containerID)
+}
+
 // Resize handles a CLI event to resize an interactive docker run or docker exec
 // Resize handles a CLI event to resize an interactive docker run or docker exec
 // window.
 // window.
 func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error {
 func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error {

+ 1 - 0
libcontainerd/types.go

@@ -34,6 +34,7 @@ type Backend interface {
 type Client interface {
 type Client interface {
 	Create(containerID string, spec Spec, options ...CreateOption) error
 	Create(containerID string, spec Spec, options ...CreateOption) error
 	Signal(containerID string, sig int) error
 	Signal(containerID string, sig int) error
+	SignalProcess(containerID string, processFriendlyName string, sig int) error
 	AddProcess(containerID, processFriendlyName string, process Process) error
 	AddProcess(containerID, processFriendlyName string, process Process) error
 	Resize(containerID, processFriendlyName string, width, height int) error
 	Resize(containerID, processFriendlyName string, width, height int) error
 	Pause(containerID string) error
 	Pause(containerID string) error

+ 40 - 0
runconfig/opts/parse.go

@@ -100,6 +100,12 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		flStopSignal        = cmd.String([]string{"-stop-signal"}, signal.DefaultStopSignal, fmt.Sprintf("Signal to stop a container, %v by default", signal.DefaultStopSignal))
 		flStopSignal        = cmd.String([]string{"-stop-signal"}, signal.DefaultStopSignal, fmt.Sprintf("Signal to stop a container, %v by default", signal.DefaultStopSignal))
 		flIsolation         = cmd.String([]string{"-isolation"}, "", "Container isolation technology")
 		flIsolation         = cmd.String([]string{"-isolation"}, "", "Container isolation technology")
 		flShmSize           = cmd.String([]string{"-shm-size"}, "", "Size of /dev/shm, default value is 64MB")
 		flShmSize           = cmd.String([]string{"-shm-size"}, "", "Size of /dev/shm, default value is 64MB")
+		// Healthcheck
+		flNoHealthcheck  = cmd.Bool([]string{"-no-healthcheck"}, false, "Disable any container-specified HEALTHCHECK")
+		flHealthCmd      = cmd.String([]string{"-health-cmd"}, "", "Command to run to check health")
+		flHealthInterval = cmd.Duration([]string{"-health-interval"}, 0, "Time between running the check")
+		flHealthTimeout  = cmd.Duration([]string{"-health-timeout"}, 0, "Maximum time to allow one check to run")
+		flHealthRetries  = cmd.Int([]string{"-health-retries"}, 0, "Consecutive failures needed to report unhealthy")
 	)
 	)
 
 
 	cmd.Var(&flAttach, []string{"a", "-attach"}, "Attach to STDIN, STDOUT or STDERR")
 	cmd.Var(&flAttach, []string{"a", "-attach"}, "Attach to STDIN, STDOUT or STDERR")
@@ -351,6 +357,39 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		return nil, nil, nil, cmd, err
 		return nil, nil, nil, cmd, err
 	}
 	}
 
 
+	// Healthcheck
+	var healthConfig *container.HealthConfig
+	haveHealthSettings := *flHealthCmd != "" ||
+		*flHealthInterval != 0 ||
+		*flHealthTimeout != 0 ||
+		*flHealthRetries != 0
+	if *flNoHealthcheck {
+		if haveHealthSettings {
+			return nil, nil, nil, cmd, fmt.Errorf("--no-healthcheck conflicts with --health-* options")
+		}
+		test := strslice.StrSlice{"NONE"}
+		healthConfig = &container.HealthConfig{Test: test}
+	} else if haveHealthSettings {
+		var probe strslice.StrSlice
+		if *flHealthCmd != "" {
+			args := []string{"CMD-SHELL", *flHealthCmd}
+			probe = strslice.StrSlice(args)
+		}
+		if *flHealthInterval < 0 {
+			return nil, nil, nil, cmd, fmt.Errorf("--health-interval cannot be negative")
+		}
+		if *flHealthTimeout < 0 {
+			return nil, nil, nil, cmd, fmt.Errorf("--health-timeout cannot be negative")
+		}
+
+		healthConfig = &container.HealthConfig{
+			Test:     probe,
+			Interval: *flHealthInterval,
+			Timeout:  *flHealthTimeout,
+			Retries:  *flHealthRetries,
+		}
+	}
+
 	resources := container.Resources{
 	resources := container.Resources{
 		CgroupParent:         *flCgroupParent,
 		CgroupParent:         *flCgroupParent,
 		Memory:               flMemory,
 		Memory:               flMemory,
@@ -399,6 +438,7 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
 		Entrypoint:      entrypoint,
 		Entrypoint:      entrypoint,
 		WorkingDir:      *flWorkingDir,
 		WorkingDir:      *flWorkingDir,
 		Labels:          ConvertKVStringsToMap(labels),
 		Labels:          ConvertKVStringsToMap(labels),
+		Healthcheck:     healthConfig,
 	}
 	}
 	if cmd.IsSet("-stop-signal") {
 	if cmd.IsSet("-stop-signal") {
 		config.StopSignal = *flStopSignal
 		config.StopSignal = *flStopSignal

+ 40 - 0
runconfig/opts/parse_test.go

@@ -9,6 +9,7 @@ import (
 	"runtime"
 	"runtime"
 	"strings"
 	"strings"
 	"testing"
 	"testing"
+	"time"
 
 
 	flag "github.com/docker/docker/pkg/mflag"
 	flag "github.com/docker/docker/pkg/mflag"
 	"github.com/docker/docker/runconfig"
 	"github.com/docker/docker/runconfig"
@@ -584,6 +585,45 @@ func TestParseRestartPolicy(t *testing.T) {
 	}
 	}
 }
 }
 
 
+func TestParseHealth(t *testing.T) {
+	checkOk := func(args ...string) *container.HealthConfig {
+		config, _, _, _, err := parseRun(args)
+		if err != nil {
+			t.Fatalf("%#v: %v", args, err)
+		}
+		return config.Healthcheck
+	}
+	checkError := func(expected string, args ...string) {
+		config, _, _, _, err := parseRun(args)
+		if err == nil {
+			t.Fatalf("Expected error, but got %#v", config)
+		}
+		if err.Error() != expected {
+			t.Fatalf("Expected %#v, got %#v", expected, err)
+		}
+	}
+	health := checkOk("--no-healthcheck", "img", "cmd")
+	if health == nil || len(health.Test) != 1 || health.Test[0] != "NONE" {
+		t.Fatalf("--no-healthcheck failed: %#v", health)
+	}
+
+	health = checkOk("--health-cmd=/check.sh -q", "img", "cmd")
+	if len(health.Test) != 2 || health.Test[0] != "CMD-SHELL" || health.Test[1] != "/check.sh -q" {
+		t.Fatalf("--health-cmd: got %#v", health.Test)
+	}
+	if health.Timeout != 0 {
+		t.Fatalf("--health-cmd: timeout = %f", health.Timeout)
+	}
+
+	checkError("--no-healthcheck conflicts with --health-* options",
+		"--no-healthcheck", "--health-cmd=/check.sh -q", "img", "cmd")
+
+	health = checkOk("--health-timeout=2s", "--health-retries=3", "--health-interval=4.5s", "img", "cmd")
+	if health.Timeout != 2*time.Second || health.Retries != 3 || health.Interval != 4500*time.Millisecond {
+		t.Fatalf("--health-*: got %#v", health)
+	}
+}
+
 func TestParseLoggingOpts(t *testing.T) {
 func TestParseLoggingOpts(t *testing.T) {
 	// logging opts ko
 	// logging opts ko
 	if _, _, _, _, err := parseRun([]string{"--log-driver=none", "--log-opt=anything", "img", "cmd"}); err == nil || err.Error() != "invalid logging opts for driver none" {
 	if _, _, _, _, err := parseRun([]string{"--log-driver=none", "--log-opt=anything", "img", "cmd"}); err == nil || err.Error() != "invalid logging opts for driver none" {

+ 14 - 1
vendor/src/github.com/docker/engine-api/client/request.go

@@ -11,6 +11,7 @@ import (
 	"strings"
 	"strings"
 
 
 	"github.com/docker/engine-api/client/transport/cancellable"
 	"github.com/docker/engine-api/client/transport/cancellable"
+	"github.com/docker/engine-api/types"
 	"golang.org/x/net/context"
 	"golang.org/x/net/context"
 )
 )
 
 
@@ -130,7 +131,19 @@ func (cli *Client) sendClientRequest(ctx context.Context, method, path string, q
 		if len(body) == 0 {
 		if len(body) == 0 {
 			return serverResp, fmt.Errorf("Error: request returned %s for API route and version %s, check if the server supports the requested API version", http.StatusText(serverResp.statusCode), req.URL)
 			return serverResp, fmt.Errorf("Error: request returned %s for API route and version %s, check if the server supports the requested API version", http.StatusText(serverResp.statusCode), req.URL)
 		}
 		}
-		return serverResp, fmt.Errorf("Error response from daemon: %s", bytes.TrimSpace(body))
+
+		var errorMessage string
+		if resp.Header.Get("Content-Type") == "application/json" {
+			var errorResponse types.ErrorResponse
+			if err := json.Unmarshal(body, &errorResponse); err != nil {
+				return serverResp, fmt.Errorf("Error reading JSON: %v", err)
+			}
+			errorMessage = errorResponse.Message
+		} else {
+			errorMessage = string(body)
+		}
+
+		return serverResp, fmt.Errorf("Error response from daemon: %s", strings.TrimSpace(errorMessage))
 	}
 	}
 
 
 	serverResp.body = resp.Body
 	serverResp.body = resp.Body

+ 22 - 0
vendor/src/github.com/docker/engine-api/types/container/config.go

@@ -3,8 +3,29 @@ package container
 import (
 import (
 	"github.com/docker/engine-api/types/strslice"
 	"github.com/docker/engine-api/types/strslice"
 	"github.com/docker/go-connections/nat"
 	"github.com/docker/go-connections/nat"
+	"time"
 )
 )
 
 
+// HealthConfig holds configuration settings for the HEALTHCHECK feature.
+type HealthConfig struct {
+	// Test is the test to perform to check that the container is healthy.
+	// An empty slice means to inherit the default.
+	// The options are:
+	// {} : inherit healthcheck
+	// {"NONE"} : disable healthcheck
+	// {"CMD", args...} : exec arguments directly
+	// {"CMD-SHELL", command} : run command with system's default shell
+	Test []string `json:",omitempty"`
+
+	// Zero means to inherit. Durations are expressed as integer nanoseconds.
+	Interval time.Duration `json:",omitempty"` // Interval is the time to wait between checks.
+	Timeout  time.Duration `json:",omitempty"` // Timeout is the time to wait before considering the check to have hung.
+
+	// Retries is the number of consecutive failures needed to consider a container as unhealthy.
+	// Zero means inherit.
+	Retries int `json:",omitempty"`
+}
+
 // Config contains the configuration data about a container.
 // Config contains the configuration data about a container.
 // It should hold only portable information about the container.
 // It should hold only portable information about the container.
 // Here, "portable" means "independent from the host we are running on".
 // Here, "portable" means "independent from the host we are running on".
@@ -24,6 +45,7 @@ type Config struct {
 	StdinOnce       bool                  // If true, close stdin after the 1 attached client disconnects.
 	StdinOnce       bool                  // If true, close stdin after the 1 attached client disconnects.
 	Env             []string              // List of environment variable to set in the container
 	Env             []string              // List of environment variable to set in the container
 	Cmd             strslice.StrSlice     // Command to run when starting the container
 	Cmd             strslice.StrSlice     // Command to run when starting the container
+	Healthcheck     *HealthConfig         `json:",omitempty"` // Healthcheck describes how to check the container is healthy
 	ArgsEscaped     bool                  `json:",omitempty"` // True if command is already escaped (Windows specific)
 	ArgsEscaped     bool                  `json:",omitempty"` // True if command is already escaped (Windows specific)
 	Image           string                // Name of the image as it was passed by the operator (eg. could be symbolic)
 	Image           string                // Name of the image as it was passed by the operator (eg. could be symbolic)
 	Volumes         map[string]struct{}   // List of volumes (mounts) used for the container
 	Volumes         map[string]struct{}   // List of volumes (mounts) used for the container

+ 6 - 0
vendor/src/github.com/docker/engine-api/types/errors.go

@@ -0,0 +1,6 @@
+package types
+
+// ErrorResponse is the response body of API errors.
+type ErrorResponse struct {
+	Message string `json:"message"`
+}

+ 23 - 0
vendor/src/github.com/docker/engine-api/types/types.go

@@ -276,6 +276,28 @@ type ExecStartCheck struct {
 	Tty bool
 	Tty bool
 }
 }
 
 
+// HealthcheckResult stores information about a single run of a healthcheck probe
+type HealthcheckResult struct {
+	Start    time.Time // Start is the time this check started
+	End      time.Time // End is the time this check ended
+	ExitCode int       // ExitCode meanings: 0=healthy, 1=unhealthy, 2=starting, else=error running probe
+	Output   string    // Output from last check
+}
+
+// Health states
+const (
+	Starting  = "starting"  // Starting indicates that the container is not yet ready
+	Healthy   = "healthy"   // Healthy indicates that the container is running correctly
+	Unhealthy = "unhealthy" // Unhealthy indicates that the container has a problem
+)
+
+// Health stores information about the container's healthcheck results
+type Health struct {
+	Status        string               // Status is one of Starting, Healthy or Unhealthy
+	FailingStreak int                  // FailingStreak is the number of consecutive failures
+	Log           []*HealthcheckResult // Log contains the last few results (oldest first)
+}
+
 // ContainerState stores container's running state
 // ContainerState stores container's running state
 // it's part of ContainerJSONBase and will return by "inspect" command
 // it's part of ContainerJSONBase and will return by "inspect" command
 type ContainerState struct {
 type ContainerState struct {
@@ -290,6 +312,7 @@ type ContainerState struct {
 	Error      string
 	Error      string
 	StartedAt  string
 	StartedAt  string
 	FinishedAt string
 	FinishedAt string
+	Health     *Health `json:",omitempty"`
 }
 }
 
 
 // ContainerNode stores information about the node that a container
 // ContainerNode stores information about the node that a container