health.go 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. package daemon
  2. import (
  3. "bytes"
  4. "fmt"
  5. "runtime"
  6. "strings"
  7. "sync"
  8. "time"
  9. "golang.org/x/net/context"
  10. "github.com/Sirupsen/logrus"
  11. "github.com/docker/docker/api/types"
  12. containertypes "github.com/docker/docker/api/types/container"
  13. "github.com/docker/docker/api/types/strslice"
  14. "github.com/docker/docker/container"
  15. "github.com/docker/docker/daemon/exec"
  16. )
const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of the first to the start of the second).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record in a container's health log.
	maxLogEntries = 5
)
const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy   = 0 // Container is healthy
	exitStatusUnhealthy = 1 // Container is unhealthy
)
// probe implementations know how to run a particular type of probe.
type probe interface {
	// run performs one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}
// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// shell selects whether the command is run with the system's default
	// shell ("CMD-SHELL") instead of being exec'd directly ("CMD").
	shell bool
}
  48. // exec the healthcheck command in the container.
  49. // Returns the exit code and probe output (if any)
  50. func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {
  51. cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:]
  52. if p.shell {
  53. cmdSlice = append(getShell(container.Config), cmdSlice...)
  54. }
  55. entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
  56. execConfig := exec.NewConfig()
  57. execConfig.OpenStdin = false
  58. execConfig.OpenStdout = true
  59. execConfig.OpenStderr = true
  60. execConfig.ContainerID = container.ID
  61. execConfig.DetachKeys = []byte{}
  62. execConfig.Entrypoint = entrypoint
  63. execConfig.Args = args
  64. execConfig.Tty = false
  65. execConfig.Privileged = false
  66. execConfig.User = container.Config.User
  67. d.registerExecCommand(container, execConfig)
  68. d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))
  69. output := &limitedBuffer{}
  70. err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
  71. if err != nil {
  72. return nil, err
  73. }
  74. info, err := d.getExecConfig(execConfig.ID)
  75. if err != nil {
  76. return nil, err
  77. }
  78. if info.ExitCode == nil {
  79. return nil, fmt.Errorf("Healthcheck for container %s has no exit code!", container.ID)
  80. }
  81. // Note: Go's json package will handle invalid UTF-8 for us
  82. out := output.String()
  83. return &types.HealthcheckResult{
  84. End: time.Now(),
  85. ExitCode: *info.ExitCode,
  86. Output: out,
  87. }, nil
  88. }
// handleProbeResult updates the container's Status.Health struct based on the
// latest probe's result. "done" is the monitor's stop channel: if it was
// closed while this call waited for the container lock, the result is stale
// and is discarded.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// probe may have been cancelled while waiting on lock. Ignore result then
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		// Unset (or invalid) retry count: fall back to the daemon default.
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status

	// Append the result, keeping at most maxLogEntries entries (oldest dropped).
	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		// A single success resets the failing streak and marks the container healthy.
		h.FailingStreak = 0
		h.Status = types.Healthy
	} else {
		// Failure (including invalid exit code)
		h.FailingStreak++
		if h.FailingStreak >= retries {
			h.Status = types.Unhealthy
		}
		// Else we're starting or healthy. Stay in that state.
	}

	// Emit a health_status event only on an actual state transition.
	if oldStatus != h.Status {
		d.LogContainerEvent(c, "health_status: "+h.Status)
	}
}
  125. // Run the container's monitoring thread until notified via "stop".
  126. // There is never more than one monitor thread running per container at a time.
  127. func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
  128. probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
  129. probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
  130. for {
  131. select {
  132. case <-stop:
  133. logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
  134. return
  135. case <-time.After(probeInterval):
  136. logrus.Debugf("Running health check for container %s ...", c.ID)
  137. startTime := time.Now()
  138. ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
  139. results := make(chan *types.HealthcheckResult)
  140. go func() {
  141. healthChecksCounter.Inc()
  142. result, err := probe.run(ctx, d, c)
  143. if err != nil {
  144. healthChecksFailedCounter.Inc()
  145. logrus.Warnf("Health check for container %s error: %v", c.ID, err)
  146. results <- &types.HealthcheckResult{
  147. ExitCode: -1,
  148. Output: err.Error(),
  149. Start: startTime,
  150. End: time.Now(),
  151. }
  152. } else {
  153. result.Start = startTime
  154. logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
  155. results <- result
  156. }
  157. close(results)
  158. }()
  159. select {
  160. case <-stop:
  161. logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
  162. // Stop timeout and kill probe, but don't wait for probe to exit.
  163. cancelProbe()
  164. return
  165. case result := <-results:
  166. handleProbeResult(d, c, result, stop)
  167. // Stop timeout
  168. cancelProbe()
  169. case <-ctx.Done():
  170. logrus.Debugf("Health check for container %s taking too long", c.ID)
  171. handleProbeResult(d, c, &types.HealthcheckResult{
  172. ExitCode: -1,
  173. Output: fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
  174. Start: startTime,
  175. End: time.Now(),
  176. }, stop)
  177. cancelProbe()
  178. // Wait for probe to exit (it might take a while to respond to the TERM
  179. // signal and we don't want dying probes to pile up).
  180. <-results
  181. }
  182. }
  183. }
  184. }
  185. // Get a suitable probe implementation for the container's healthcheck configuration.
  186. // Nil will be returned if no healthcheck was configured or NONE was set.
  187. func getProbe(c *container.Container) probe {
  188. config := c.Config.Healthcheck
  189. if config == nil || len(config.Test) == 0 {
  190. return nil
  191. }
  192. switch config.Test[0] {
  193. case "CMD":
  194. return &cmdProbe{shell: false}
  195. case "CMD-SHELL":
  196. return &cmdProbe{shell: true}
  197. default:
  198. logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
  199. return nil
  200. }
  201. }
// updateHealthMonitor ensures the health-check monitor is running or not,
// depending on the current state of the container.
// Called from monitor.go, with c locked.
func (d *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	// Only monitor containers that are running, not paused, and actually
	// have a usable probe configured.
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		// NOTE(review): presumably OpenMonitorChannel returns nil when a
		// monitor is already open, keeping at most one monitor goroutine
		// per container — confirm against container.Health.
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(d, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}
// initHealthMonitor resets the health state for a newly-started, restarted or
// restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (d *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is setup then don't init the monitor
	if getProbe(c) == nil {
		return
	}

	// This is needed in case we're auto-restarting
	d.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		// Reuse the existing Health struct (preserving its log) but start
		// over from the "starting" state with a clean failure count.
		h.Status = types.Starting
		h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.Status = types.Starting
		c.State.Health = h
	}

	d.updateHealthMonitor(c)
}
  241. // Called when the container is being stopped (whether because the health check is
  242. // failing or for any other reason).
  243. func (d *Daemon) stopHealthchecks(c *container.Container) {
  244. h := c.State.Health
  245. if h != nil {
  246. h.CloseMonitorChannel()
  247. }
  248. }
// limitedBuffer buffers up to maxOutputLen bytes of probe output.
// Further data is discarded and the buffer is flagged as truncated.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex // guards buf and truncated (stdout/stderr write concurrently)
	truncated bool       // indicates that data has been lost
}
  255. // Append to limitedBuffer while there is room.
  256. func (b *limitedBuffer) Write(data []byte) (int, error) {
  257. b.mu.Lock()
  258. defer b.mu.Unlock()
  259. bufLen := b.buf.Len()
  260. dataLen := len(data)
  261. keep := min(maxOutputLen-bufLen, dataLen)
  262. if keep > 0 {
  263. b.buf.Write(data[:keep])
  264. }
  265. if keep < dataLen {
  266. b.truncated = true
  267. }
  268. return dataLen, nil
  269. }
  270. // The contents of the buffer, with "..." appended if it overflowed.
  271. func (b *limitedBuffer) String() string {
  272. b.mu.Lock()
  273. defer b.mu.Unlock()
  274. out := b.buf.String()
  275. if b.truncated {
  276. out = out + "..."
  277. }
  278. return out
  279. }
  280. // If configuredValue is zero, use defaultValue instead.
  281. func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
  282. if configuredValue == 0 {
  283. return defaultValue
  284. }
  285. return configuredValue
  286. }
  287. func min(x, y int) int {
  288. if x < y {
  289. return x
  290. }
  291. return y
  292. }
  293. func getShell(config *containertypes.Config) []string {
  294. if len(config.Shell) != 0 {
  295. return config.Shell
  296. }
  297. if runtime.GOOS != "windows" {
  298. return []string{"/bin/sh", "-c"}
  299. }
  300. return []string{"cmd", "/S", "/C"}
  301. }