health.go 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313
  1. package daemon
import (
	"bytes"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/docker/engine-api/types"
	"github.com/docker/engine-api/types/strslice"
	"golang.org/x/net/context"
)
  15. const (
  16. // Longest healthcheck probe output message to store. Longer messages will be truncated.
  17. maxOutputLen = 4096
  18. // Default interval between probe runs (from the end of the first to the start of the second).
  19. // Also the time before the first probe.
  20. defaultProbeInterval = 30 * time.Second
  21. // The maximum length of time a single probe run should take. If the probe takes longer
  22. // than this, the check is considered to have failed.
  23. defaultProbeTimeout = 30 * time.Second
  24. // Default number of consecutive failures of the health check
  25. // for the container to be considered unhealthy.
  26. defaultProbeRetries = 3
  27. // Maximum number of entries to record
  28. maxLogEntries = 5
  29. )
  30. const (
  31. // Exit status codes that can be returned by the probe command.
  32. exitStatusHealthy = 0 // Container is healthy
  33. exitStatusUnhealthy = 1 // Container is unhealthy
  34. )
  35. // probe implementations know how to run a particular type of probe.
  36. type probe interface {
  37. // Perform one run of the check. Returns the exit code and an optional
  38. // short diagnostic string.
  39. run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
  40. }
  41. // cmdProbe implements the "CMD" probe type.
  42. type cmdProbe struct {
  43. // Run the command with the system's default shell instead of execing it directly.
  44. shell bool
  45. }
  46. // exec the healthcheck command in the container.
  47. // Returns the exit code and probe output (if any)
  48. func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {
  49. cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:]
  50. if p.shell {
  51. if runtime.GOOS != "windows" {
  52. cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...)
  53. } else {
  54. cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...)
  55. }
  56. }
  57. entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
  58. execConfig := exec.NewConfig()
  59. execConfig.OpenStdin = false
  60. execConfig.OpenStdout = true
  61. execConfig.OpenStderr = true
  62. execConfig.ContainerID = container.ID
  63. execConfig.DetachKeys = []byte{}
  64. execConfig.Entrypoint = entrypoint
  65. execConfig.Args = args
  66. execConfig.Tty = false
  67. execConfig.Privileged = false
  68. execConfig.User = container.Config.User
  69. d.registerExecCommand(container, execConfig)
  70. d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))
  71. output := &limitedBuffer{}
  72. err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
  73. if err != nil {
  74. return nil, err
  75. }
  76. info, err := d.getExecConfig(execConfig.ID)
  77. if err != nil {
  78. return nil, err
  79. }
  80. if info.ExitCode == nil {
  81. return nil, fmt.Errorf("Healthcheck has no exit code!")
  82. }
  83. // Note: Go's json package will handle invalid UTF-8 for us
  84. out := output.String()
  85. return &types.HealthcheckResult{
  86. End: time.Now(),
  87. ExitCode: *info.ExitCode,
  88. Output: out,
  89. }, nil
  90. }
  91. // Update the container's Status.Health struct based on the latest probe's result.
  92. func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) {
  93. c.Lock()
  94. defer c.Unlock()
  95. retries := c.Config.Healthcheck.Retries
  96. if retries <= 0 {
  97. retries = defaultProbeRetries
  98. }
  99. h := c.State.Health
  100. oldStatus := h.Status
  101. if len(h.Log) >= maxLogEntries {
  102. h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
  103. } else {
  104. h.Log = append(h.Log, result)
  105. }
  106. if result.ExitCode == exitStatusHealthy {
  107. h.FailingStreak = 0
  108. h.Status = types.Healthy
  109. } else {
  110. // Failure (including invalid exit code)
  111. h.FailingStreak++
  112. if h.FailingStreak >= retries {
  113. h.Status = types.Unhealthy
  114. }
  115. // Else we're starting or healthy. Stay in that state.
  116. }
  117. if oldStatus != h.Status {
  118. d.LogContainerEvent(c, "health_status: "+h.Status)
  119. }
  120. }
  121. // Run the container's monitoring thread until notified via "stop".
  122. // There is never more than one monitor thread running per container at a time.
  123. func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
  124. probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
  125. probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
  126. for {
  127. select {
  128. case <-stop:
  129. logrus.Debug("Stop healthcheck monitoring (received while idle)")
  130. return
  131. case <-time.After(probeInterval):
  132. logrus.Debug("Running health check...")
  133. startTime := time.Now()
  134. ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
  135. results := make(chan *types.HealthcheckResult)
  136. go func() {
  137. result, err := probe.run(ctx, d, c)
  138. if err != nil {
  139. logrus.Warnf("Health check error: %v", err)
  140. results <- &types.HealthcheckResult{
  141. ExitCode: -1,
  142. Output: err.Error(),
  143. Start: startTime,
  144. End: time.Now(),
  145. }
  146. } else {
  147. result.Start = startTime
  148. logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode)
  149. results <- result
  150. }
  151. close(results)
  152. }()
  153. select {
  154. case <-stop:
  155. logrus.Debug("Stop healthcheck monitoring (received while probing)")
  156. // Stop timeout and kill probe, but don't wait for probe to exit.
  157. cancelProbe()
  158. return
  159. case result := <-results:
  160. handleProbeResult(d, c, result)
  161. // Stop timeout
  162. cancelProbe()
  163. case <-ctx.Done():
  164. logrus.Debug("Health check taking too long")
  165. handleProbeResult(d, c, &types.HealthcheckResult{
  166. ExitCode: -1,
  167. Output: fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
  168. Start: startTime,
  169. End: time.Now(),
  170. })
  171. cancelProbe()
  172. // Wait for probe to exit (it might take a while to respond to the TERM
  173. // signal and we don't want dying probes to pile up).
  174. <-results
  175. }
  176. }
  177. }
  178. }
  179. // Get a suitable probe implementation for the container's healthcheck configuration.
  180. // Nil will be returned if no healthcheck was configured or NONE was set.
  181. func getProbe(c *container.Container) probe {
  182. config := c.Config.Healthcheck
  183. if config == nil || len(config.Test) == 0 {
  184. return nil
  185. }
  186. switch config.Test[0] {
  187. case "CMD":
  188. return &cmdProbe{shell: false}
  189. case "CMD-SHELL":
  190. return &cmdProbe{shell: true}
  191. default:
  192. logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0])
  193. return nil
  194. }
  195. }
  196. // Ensure the health-check monitor is running or not, depending on the current
  197. // state of the container.
  198. // Called from monitor.go, with c locked.
  199. func (d *Daemon) updateHealthMonitor(c *container.Container) {
  200. h := c.State.Health
  201. if h == nil {
  202. return // No healthcheck configured
  203. }
  204. probe := getProbe(c)
  205. wantRunning := c.Running && !c.Paused && probe != nil
  206. if wantRunning {
  207. if stop := h.OpenMonitorChannel(); stop != nil {
  208. go monitor(d, c, stop, probe)
  209. }
  210. } else {
  211. h.CloseMonitorChannel()
  212. }
  213. }
  214. // Reset the health state for a newly-started, restarted or restored container.
  215. // initHealthMonitor is called from monitor.go and we should never be running
  216. // two instances at once.
  217. // Called with c locked.
  218. func (d *Daemon) initHealthMonitor(c *container.Container) {
  219. // If no healthcheck is setup then don't init the monitor
  220. if getProbe(c) == nil {
  221. return
  222. }
  223. // This is needed in case we're auto-restarting
  224. d.stopHealthchecks(c)
  225. if c.State.Health == nil {
  226. h := &container.Health{}
  227. h.Status = types.Starting
  228. c.State.Health = h
  229. }
  230. d.updateHealthMonitor(c)
  231. }
  232. // Called when the container is being stopped (whether because the health check is
  233. // failing or for any other reason).
  234. func (d *Daemon) stopHealthchecks(c *container.Container) {
  235. h := c.State.Health
  236. if h != nil {
  237. h.CloseMonitorChannel()
  238. }
  239. }
  240. // Buffer up to maxOutputLen bytes. Further data is discarded.
  241. type limitedBuffer struct {
  242. buf bytes.Buffer
  243. truncated bool // indicates that data has been lost
  244. }
  245. // Append to limitedBuffer while there is room.
  246. func (b *limitedBuffer) Write(data []byte) (int, error) {
  247. bufLen := b.buf.Len()
  248. dataLen := len(data)
  249. keep := min(maxOutputLen-bufLen, dataLen)
  250. if keep > 0 {
  251. b.buf.Write(data[:keep])
  252. }
  253. if keep < dataLen {
  254. b.truncated = true
  255. }
  256. return dataLen, nil
  257. }
  258. // The contents of the buffer, with "..." appended if it overflowed.
  259. func (b *limitedBuffer) String() string {
  260. out := b.buf.String()
  261. if b.truncated {
  262. out = out + "..."
  263. }
  264. return out
  265. }
  266. // If configuredValue is zero, use defaultValue instead.
  267. func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
  268. if configuredValue == 0 {
  269. return defaultValue
  270. }
  271. return configuredValue
  272. }
  273. func min(x, y int) int {
  274. if x < y {
  275. return x
  276. }
  277. return y
  278. }