package daemon

import (
	"bytes"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/docker/docker/api/types"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/sirupsen/logrus"
	"golang.org/x/net/context"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of one run to the start of the next).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record in a container's health log.
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.
	exitStatusHealthy = 0 // Container is healthy
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}
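
// Note: cmdProbe below is currently the only probe implementation in this
// file; it covers both the "CMD" and "CMD-SHELL" healthcheck types via its
// shell flag (see getProbe).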

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of exec'ing it directly.
	shell bool
}

// Exec the healthcheck command in the container.
// Returns the exit code and probe output (if any).
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	// Strip the probe-type token (Test[0]) and keep the command itself.
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		cmdSlice = append(getShell(cntr.Config), cmdSlice...)
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := exec.NewConfig()
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.ContainerID = cntr.ID
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User
	execConfig.WorkingDir = cntr.Config.WorkingDir

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	d.LogContainerEvent(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))

	output := &limitedBuffer{}
	err = d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
	if err != nil {
		return nil, err
	}
	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	if info.ExitCode == nil {
		return nil, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: *info.ExitCode,
		Output:   out,
	}, nil
}
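
// As an illustration of the command assembly above (a sketch; the healthcheck
// command is hypothetical): with shell = true on Linux and no custom SHELL
// configured, a probe configured as "curl -f http://localhost/ || exit 1" is
// executed as
//
//	/bin/sh -c "curl -f http://localhost/ || exit 1"
//
// because cmdSlice is prefixed with the result of getShell before being turned
// into an exec entrypoint and args.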

// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// Probe may have been cancelled while waiting on lock. Ignore the result then.
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status()
	if len(h.Log) >= maxLogEntries {
		// Drop the oldest entries so that at most maxLogEntries are kept.
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.SetStatus(types.Healthy)
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we never had a successful health check)
		// then we check if we are within the start period of the container in which
		// case we do not increment the failure streak.
		if h.Status() == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment the failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
			}
		}

		if shouldIncrementStreak {
			h.FailingStreak++
			if h.FailingStreak >= retries {
				h.SetStatus(types.Unhealthy)
			}
		}
		// Else we're starting or healthy. Stay in that state.
	}

	// Replicate Health status changes; queries will be inconsistent until the
	// next probe runs or other state mutations checkpoint the container.
	if err := c.CheckpointTo(d.containersReplica); err != nil {
		logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	current := h.Status()
	if oldStatus != current {
		d.LogContainerEvent(c, "health_status: "+current)
	}
}
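
// A worked example of the policy above, using the defaults in this file: with
// Retries unset (so retries = 3) and no start period, three consecutive probes
// with a non-zero exit code move the container to "unhealthy" with a
// FailingStreak of 3, while a single probe that exits 0 at any point resets
// the streak to 0 and the status to "healthy".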

// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
	for {
		select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-time.After(probeInterval):
			logrus.Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				// Stop timeout
				cancelProbe()
			case <-ctx.Done():
				logrus.Debugf("Health check for container %s taking too long", c.ID)
				handleProbeResult(d, c, &types.HealthcheckResult{
					ExitCode: -1,
					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
					Start:    startTime,
					End:      time.Now(),
				}, stop)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
			}
		}
	}
}

// Get a suitable probe implementation for the container's healthcheck configuration.
// Nil will be returned if no healthcheck was configured or NONE was set.
func getProbe(c *container.Container) probe {
	config := c.Config.Healthcheck
	if config == nil || len(config.Test) == 0 {
		return nil
	}
	switch config.Test[0] {
	case "CMD":
		return &cmdProbe{shell: false}
	case "CMD-SHELL":
		return &cmdProbe{shell: true}
	case "NONE":
		return nil
	default:
		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD', 'CMD-SHELL' or 'NONE') in container %s", config.Test[0], c.ID)
		return nil
	}
}
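
// For reference (a sketch of how this file interprets the Test slice): the
// probe type travels as Test[0] and the command as the remainder, e.g.
//
//	{"CMD", "curl", "-f", "http://localhost/"}  -> cmdProbe{shell: false}
//	{"CMD-SHELL", "curl -f http://localhost/"}  -> cmdProbe{shell: true}
//	{"NONE"}                                    -> nil (healthcheck disabled)
//
// which is why cmdProbe.run slices Test from index 1.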

// Ensure the health-check monitor is running or not, depending on the current
// state of the container.
// Called from monitor.go, with c locked.
func (d *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(d, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}

// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (d *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is set up then don't init the monitor.
	if getProbe(c) == nil {
		return
	}

	// This is needed in case we're auto-restarting.
	d.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		h.SetStatus(types.Starting)
		h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.SetStatus(types.Starting)
		c.State.Health = h
	}

	d.updateHealthMonitor(c)
}

// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (d *Daemon) stopHealthchecks(c *container.Container) {
	h := c.State.Health
	if h != nil {
		h.CloseMonitorChannel()
	}
}

// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex
	truncated bool // indicates that data has been lost
}

// Append to limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()

	bufLen := b.buf.Len()
	dataLen := len(data)
	keep := min(maxOutputLen-bufLen, dataLen)
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < dataLen {
		b.truncated = true
	}
	return dataLen, nil
}
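
// Note (an observation on the contract above): Write always reports the full
// input length as consumed even when it discards the excess, so callers such
// as ContainerExecStart see every write succeed and the exec's output streams
// are never interrupted by the cap; only the stored output is truncated.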

// The contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	b.mu.Lock()
	defer b.mu.Unlock()

	out := b.buf.String()
	if b.truncated {
		out = out + "..."
	}
	return out
}

// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue == 0 {
		return defaultValue
	}
	return configuredValue
}

func min(x, y int) int {
	if x < y {
		return x
	}
	return y
}

// getShell returns the shell used to run CMD-SHELL probes: the container's
// configured Shell if any, otherwise the platform default.
func getShell(config *containertypes.Config) []string {
	if len(config.Shell) != 0 {
		return config.Shell
	}
	if runtime.GOOS != "windows" {
		return []string{"/bin/sh", "-c"}
	}
	return []string{"cmd", "/S", "/C"}
}