123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313 |
- package daemon
import (
	"bytes"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/context"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/docker/engine-api/types"
	"github.com/docker/engine-api/types/strslice"
)
// Tunables for healthcheck probing. The probe defaults are used when the
// container's healthcheck configuration leaves the matching field unset.
const (
	// maxOutputLen caps how many bytes of probe output are stored; anything
	// beyond this is truncated.
	maxOutputLen = 4 * 1024

	// defaultProbeInterval is the pause between probe runs, measured from the
	// end of one run to the start of the next. The same delay precedes the
	// very first probe.
	defaultProbeInterval = 30 * time.Second

	// defaultProbeTimeout bounds a single probe run. A probe that takes
	// longer than this is treated as a failed check.
	defaultProbeTimeout = 30 * time.Second

	// defaultProbeRetries is how many consecutive failed checks it takes for
	// the container to be considered unhealthy.
	defaultProbeRetries = 3

	// maxLogEntries is the maximum number of probe results kept in the
	// health log.
	maxLogEntries = 5
)
// Exit status codes that a probe command can return.
const (
	// exitStatusHealthy reports that the container is healthy.
	exitStatusHealthy = 0
	// exitStatusUnhealthy reports that the container is unhealthy.
	exitStatusUnhealthy = 1
)
- // probe implementations know how to run a particular type of probe.
- type probe interface {
- // Perform one run of the check. Returns the exit code and an optional
- // short diagnostic string.
- run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
- }
// cmdProbe is the probe implementation for the "CMD" healthcheck type (and,
// with shell set, for "CMD-SHELL").
type cmdProbe struct {
	// shell selects how the command is launched: true wraps it in the
	// system's default shell, false execs it directly.
	shell bool
}
- // exec the healthcheck command in the container.
- // Returns the exit code and probe output (if any)
- func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {
- cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:]
- if p.shell {
- if runtime.GOOS != "windows" {
- cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...)
- } else {
- cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...)
- }
- }
- entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
- execConfig := exec.NewConfig()
- execConfig.OpenStdin = false
- execConfig.OpenStdout = true
- execConfig.OpenStderr = true
- execConfig.ContainerID = container.ID
- execConfig.DetachKeys = []byte{}
- execConfig.Entrypoint = entrypoint
- execConfig.Args = args
- execConfig.Tty = false
- execConfig.Privileged = false
- execConfig.User = container.Config.User
- d.registerExecCommand(container, execConfig)
- d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))
- output := &limitedBuffer{}
- err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
- if err != nil {
- return nil, err
- }
- info, err := d.getExecConfig(execConfig.ID)
- if err != nil {
- return nil, err
- }
- if info.ExitCode == nil {
- return nil, fmt.Errorf("Healthcheck has no exit code!")
- }
- // Note: Go's json package will handle invalid UTF-8 for us
- out := output.String()
- return &types.HealthcheckResult{
- End: time.Now(),
- ExitCode: *info.ExitCode,
- Output: out,
- }, nil
- }
- // Update the container's Status.Health struct based on the latest probe's result.
- func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) {
- c.Lock()
- defer c.Unlock()
- retries := c.Config.Healthcheck.Retries
- if retries <= 0 {
- retries = defaultProbeRetries
- }
- h := c.State.Health
- oldStatus := h.Status
- if len(h.Log) >= maxLogEntries {
- h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
- } else {
- h.Log = append(h.Log, result)
- }
- if result.ExitCode == exitStatusHealthy {
- h.FailingStreak = 0
- h.Status = types.Healthy
- } else {
- // Failure (including invalid exit code)
- h.FailingStreak++
- if h.FailingStreak >= retries {
- h.Status = types.Unhealthy
- }
- // Else we're starting or healthy. Stay in that state.
- }
- if oldStatus != h.Status {
- d.LogContainerEvent(c, "health_status: "+h.Status)
- }
- }
- // Run the container's monitoring thread until notified via "stop".
- // There is never more than one monitor thread running per container at a time.
- func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
- probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
- probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
- for {
- select {
- case <-stop:
- logrus.Debug("Stop healthcheck monitoring (received while idle)")
- return
- case <-time.After(probeInterval):
- logrus.Debug("Running health check...")
- startTime := time.Now()
- ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
- results := make(chan *types.HealthcheckResult)
- go func() {
- result, err := probe.run(ctx, d, c)
- if err != nil {
- logrus.Warnf("Health check error: %v", err)
- results <- &types.HealthcheckResult{
- ExitCode: -1,
- Output: err.Error(),
- Start: startTime,
- End: time.Now(),
- }
- } else {
- result.Start = startTime
- logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode)
- results <- result
- }
- close(results)
- }()
- select {
- case <-stop:
- logrus.Debug("Stop healthcheck monitoring (received while probing)")
- // Stop timeout and kill probe, but don't wait for probe to exit.
- cancelProbe()
- return
- case result := <-results:
- handleProbeResult(d, c, result)
- // Stop timeout
- cancelProbe()
- case <-ctx.Done():
- logrus.Debug("Health check taking too long")
- handleProbeResult(d, c, &types.HealthcheckResult{
- ExitCode: -1,
- Output: fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
- Start: startTime,
- End: time.Now(),
- })
- cancelProbe()
- // Wait for probe to exit (it might take a while to respond to the TERM
- // signal and we don't want dying probes to pile up).
- <-results
- }
- }
- }
- }
- // Get a suitable probe implementation for the container's healthcheck configuration.
- // Nil will be returned if no healthcheck was configured or NONE was set.
- func getProbe(c *container.Container) probe {
- config := c.Config.Healthcheck
- if config == nil || len(config.Test) == 0 {
- return nil
- }
- switch config.Test[0] {
- case "CMD":
- return &cmdProbe{shell: false}
- case "CMD-SHELL":
- return &cmdProbe{shell: true}
- default:
- logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0])
- return nil
- }
- }
- // Ensure the health-check monitor is running or not, depending on the current
- // state of the container.
- // Called from monitor.go, with c locked.
- func (d *Daemon) updateHealthMonitor(c *container.Container) {
- h := c.State.Health
- if h == nil {
- return // No healthcheck configured
- }
- probe := getProbe(c)
- wantRunning := c.Running && !c.Paused && probe != nil
- if wantRunning {
- if stop := h.OpenMonitorChannel(); stop != nil {
- go monitor(d, c, stop, probe)
- }
- } else {
- h.CloseMonitorChannel()
- }
- }
- // Reset the health state for a newly-started, restarted or restored container.
- // initHealthMonitor is called from monitor.go and we should never be running
- // two instances at once.
- // Called with c locked.
- func (d *Daemon) initHealthMonitor(c *container.Container) {
- // If no healthcheck is setup then don't init the monitor
- if getProbe(c) == nil {
- return
- }
- // This is needed in case we're auto-restarting
- d.stopHealthchecks(c)
- if c.State.Health == nil {
- h := &container.Health{}
- h.Status = types.Starting
- c.State.Health = h
- }
- d.updateHealthMonitor(c)
- }
- // Called when the container is being stopped (whether because the health check is
- // failing or for any other reason).
- func (d *Daemon) stopHealthchecks(c *container.Container) {
- h := c.State.Health
- if h != nil {
- h.CloseMonitorChannel()
- }
- }
- // Buffer up to maxOutputLen bytes. Further data is discarded.
- type limitedBuffer struct {
- buf bytes.Buffer
- truncated bool // indicates that data has been lost
- }
- // Append to limitedBuffer while there is room.
- func (b *limitedBuffer) Write(data []byte) (int, error) {
- bufLen := b.buf.Len()
- dataLen := len(data)
- keep := min(maxOutputLen-bufLen, dataLen)
- if keep > 0 {
- b.buf.Write(data[:keep])
- }
- if keep < dataLen {
- b.truncated = true
- }
- return dataLen, nil
- }
- // The contents of the buffer, with "..." appended if it overflowed.
- func (b *limitedBuffer) String() string {
- out := b.buf.String()
- if b.truncated {
- out = out + "..."
- }
- return out
- }
// timeoutWithDefault returns configuredValue unless it is zero, in which case
// defaultValue is returned instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue != 0 {
		return configuredValue
	}
	return defaultValue
}
// min returns the smaller of x and y.
func min(x, y int) int {
	if y <= x {
		return y
	}
	return x
}
|