health.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457
  1. package daemon // import "github.com/docker/docker/daemon"
  2. import (
  3. "bytes"
  4. "context"
  5. "fmt"
  6. "runtime"
  7. "strings"
  8. "sync"
  9. "time"
  10. "github.com/containerd/log"
  11. "github.com/docker/docker/api/types"
  12. containertypes "github.com/docker/docker/api/types/container"
  13. "github.com/docker/docker/api/types/events"
  14. "github.com/docker/docker/api/types/strslice"
  15. "github.com/docker/docker/container"
  16. )
  17. const (
  18. // Longest healthcheck probe output message to store. Longer messages will be truncated.
  19. maxOutputLen = 4096
  20. // Default interval between probe runs (from the end of the first to the start of the second).
  21. // Also the time before the first probe.
  22. defaultProbeInterval = 30 * time.Second
  23. // The maximum length of time a single probe run should take. If the probe takes longer
  24. // than this, the check is considered to have failed.
  25. defaultProbeTimeout = 30 * time.Second
  26. // The time given for the container to start before the health check starts considering
  27. // the container unstable. Defaults to none.
  28. defaultStartPeriod = 0 * time.Second
  29. // Default number of consecutive failures of the health check
  30. // for the container to be considered unhealthy.
  31. defaultProbeRetries = 3
  32. // Maximum number of entries to record
  33. maxLogEntries = 5
  34. )
  35. const (
  36. // Exit status codes that can be returned by the probe command.
  37. exitStatusHealthy = 0 // Container is healthy
  38. )
  39. // probe implementations know how to run a particular type of probe.
  40. type probe interface {
  41. // Perform one run of the check. Returns the exit code and an optional
  42. // short diagnostic string.
  43. run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
  44. }
  45. // cmdProbe implements the "CMD" probe type.
  46. type cmdProbe struct {
  47. // Run the command with the system's default shell instead of execing it directly.
  48. shell bool
  49. }
  50. // exec the healthcheck command in the container.
  51. // Returns the exit code and probe output (if any)
  52. func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
  53. startTime := time.Now()
  54. cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
  55. if p.shell {
  56. cmdSlice = append(getShell(cntr), cmdSlice...)
  57. }
  58. entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
  59. execConfig := container.NewExecConfig(cntr)
  60. execConfig.OpenStdin = false
  61. execConfig.OpenStdout = true
  62. execConfig.OpenStderr = true
  63. execConfig.DetachKeys = []byte{}
  64. execConfig.Entrypoint = entrypoint
  65. execConfig.Args = args
  66. execConfig.Tty = false
  67. execConfig.Privileged = false
  68. execConfig.User = cntr.Config.User
  69. execConfig.WorkingDir = cntr.Config.WorkingDir
  70. linkedEnv, err := d.setupLinkedContainers(cntr)
  71. if err != nil {
  72. return nil, err
  73. }
  74. execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)
  75. d.registerExecCommand(cntr, execConfig)
  76. d.LogContainerEventWithAttributes(cntr, events.Action(string(events.ActionExecCreate)+": "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " ")), map[string]string{
  77. "execID": execConfig.ID,
  78. })
  79. output := &limitedBuffer{}
  80. probeCtx, cancelProbe := context.WithCancel(ctx)
  81. defer cancelProbe()
  82. execErr := make(chan error, 1)
  83. options := containertypes.ExecStartOptions{
  84. Stdout: output,
  85. Stderr: output,
  86. }
  87. go func() { execErr <- d.ContainerExecStart(probeCtx, execConfig.ID, options) }()
  88. // Starting an exec can take a significant amount of time: on the order
  89. // of 1s in extreme cases. The time it takes dockerd and containerd to
  90. // start the exec is time that the probe process is not running, and so
  91. // should not count towards the health check's timeout. Apply a separate
  92. // timeout to abort if the exec request is wedged.
  93. tm := time.NewTimer(30 * time.Second)
  94. defer tm.Stop()
  95. select {
  96. case <-tm.C:
  97. return nil, fmt.Errorf("timed out starting health check for container %s", cntr.ID)
  98. case err := <-execErr:
  99. if err != nil {
  100. return nil, err
  101. }
  102. case <-execConfig.Started:
  103. healthCheckStartDuration.UpdateSince(startTime)
  104. }
  105. if !tm.Stop() {
  106. <-tm.C
  107. }
  108. probeTimeout := timeoutWithDefault(cntr.Config.Healthcheck.Timeout, defaultProbeTimeout)
  109. tm.Reset(probeTimeout)
  110. select {
  111. case <-tm.C:
  112. cancelProbe()
  113. log.G(ctx).WithContext(ctx).Debugf("Health check for container %s taking too long", cntr.ID)
  114. // Wait for probe to exit (it might take some time to call containerd to kill
  115. // the process and we don't want dying probes to pile up).
  116. <-execErr
  117. var msg string
  118. if out := output.String(); len(out) > 0 {
  119. msg = fmt.Sprintf("Health check exceeded timeout (%v): %s", probeTimeout, out)
  120. } else {
  121. msg = fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout)
  122. }
  123. return &types.HealthcheckResult{
  124. ExitCode: -1,
  125. Output: msg,
  126. End: time.Now(),
  127. }, nil
  128. case err := <-execErr:
  129. if err != nil {
  130. return nil, err
  131. }
  132. }
  133. info, err := d.getExecConfig(execConfig.ID)
  134. if err != nil {
  135. return nil, err
  136. }
  137. exitCode, err := func() (int, error) {
  138. info.Lock()
  139. defer info.Unlock()
  140. if info.ExitCode == nil {
  141. return 0, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
  142. }
  143. return *info.ExitCode, nil
  144. }()
  145. if err != nil {
  146. return nil, err
  147. }
  148. // Note: Go's json package will handle invalid UTF-8 for us
  149. out := output.String()
  150. return &types.HealthcheckResult{
  151. End: time.Now(),
  152. ExitCode: exitCode,
  153. Output: out,
  154. }, nil
  155. }
  156. // Update the container's Status.Health struct based on the latest probe's result.
  157. func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
  158. c.Lock()
  159. defer c.Unlock()
  160. // probe may have been cancelled while waiting on lock. Ignore result then
  161. select {
  162. case <-done:
  163. return
  164. default:
  165. }
  166. retries := c.Config.Healthcheck.Retries
  167. if retries <= 0 {
  168. retries = defaultProbeRetries
  169. }
  170. h := c.State.Health
  171. oldStatus := h.Status()
  172. if len(h.Log) >= maxLogEntries {
  173. h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
  174. } else {
  175. h.Log = append(h.Log, result)
  176. }
  177. if result.ExitCode == exitStatusHealthy {
  178. h.FailingStreak = 0
  179. h.SetStatus(types.Healthy)
  180. } else { // Failure (including invalid exit code)
  181. shouldIncrementStreak := true
  182. // If the container is starting (i.e. we never had a successful health check)
  183. // then we check if we are within the start period of the container in which
  184. // case we do not increment the failure streak.
  185. if h.Status() == types.Starting {
  186. startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
  187. timeSinceStart := result.Start.Sub(c.State.StartedAt)
  188. // If still within the start period, then don't increment failing streak.
  189. if timeSinceStart < startPeriod {
  190. shouldIncrementStreak = false
  191. }
  192. }
  193. if shouldIncrementStreak {
  194. h.FailingStreak++
  195. if h.FailingStreak >= retries {
  196. h.SetStatus(types.Unhealthy)
  197. }
  198. }
  199. // Else we're starting or healthy. Stay in that state.
  200. }
  201. // replicate Health status changes
  202. if err := c.CheckpointTo(d.containersReplica); err != nil {
  203. // queries will be inconsistent until the next probe runs or other state mutations
  204. // checkpoint the container
  205. log.G(context.TODO()).Errorf("Error replicating health state for container %s: %v", c.ID, err)
  206. }
  207. current := h.Status()
  208. if oldStatus != current {
  209. d.LogContainerEvent(c, events.Action(string(events.ActionHealthStatus)+": "+current))
  210. }
  211. }
  212. // Run the container's monitoring thread until notified via "stop".
  213. // There is never more than one monitor thread running per container at a time.
  214. func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
  215. probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
  216. startInterval := timeoutWithDefault(c.Config.Healthcheck.StartInterval, probeInterval)
  217. startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
  218. c.Lock()
  219. started := c.State.StartedAt
  220. c.Unlock()
  221. getInterval := func() time.Duration {
  222. if time.Since(started) >= startPeriod {
  223. return probeInterval
  224. }
  225. c.Lock()
  226. status := c.Health.Health.Status
  227. c.Unlock()
  228. if status == types.Starting {
  229. return startInterval
  230. }
  231. return probeInterval
  232. }
  233. intervalTimer := time.NewTimer(getInterval())
  234. defer intervalTimer.Stop()
  235. for {
  236. select {
  237. case <-stop:
  238. log.G(context.TODO()).Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
  239. return
  240. case <-intervalTimer.C:
  241. log.G(context.TODO()).Debugf("Running health check for container %s ...", c.ID)
  242. startTime := time.Now()
  243. ctx, cancelProbe := context.WithCancel(context.Background())
  244. results := make(chan *types.HealthcheckResult, 1)
  245. go func() {
  246. healthChecksCounter.Inc()
  247. result, err := probe.run(ctx, d, c)
  248. if err != nil {
  249. healthChecksFailedCounter.Inc()
  250. log.G(ctx).Warnf("Health check for container %s error: %v", c.ID, err)
  251. results <- &types.HealthcheckResult{
  252. ExitCode: -1,
  253. Output: err.Error(),
  254. Start: startTime,
  255. End: time.Now(),
  256. }
  257. } else {
  258. result.Start = startTime
  259. log.G(ctx).Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
  260. results <- result
  261. }
  262. close(results)
  263. }()
  264. select {
  265. case <-stop:
  266. log.G(ctx).Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
  267. cancelProbe()
  268. // Wait for probe to exit (it might take a while to respond to the TERM
  269. // signal and we don't want dying probes to pile up).
  270. <-results
  271. return
  272. case result := <-results:
  273. handleProbeResult(d, c, result, stop)
  274. cancelProbe()
  275. }
  276. }
  277. intervalTimer.Reset(getInterval())
  278. }
  279. }
  280. // Get a suitable probe implementation for the container's healthcheck configuration.
  281. // Nil will be returned if no healthcheck was configured or NONE was set.
  282. func getProbe(c *container.Container) probe {
  283. config := c.Config.Healthcheck
  284. if config == nil || len(config.Test) == 0 {
  285. return nil
  286. }
  287. switch config.Test[0] {
  288. case "CMD":
  289. return &cmdProbe{shell: false}
  290. case "CMD-SHELL":
  291. return &cmdProbe{shell: true}
  292. case "NONE":
  293. return nil
  294. default:
  295. log.G(context.TODO()).Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
  296. return nil
  297. }
  298. }
  299. // Ensure the health-check monitor is running or not, depending on the current
  300. // state of the container.
  301. // Called from monitor.go, with c locked.
  302. func (daemon *Daemon) updateHealthMonitor(c *container.Container) {
  303. h := c.State.Health
  304. if h == nil {
  305. return // No healthcheck configured
  306. }
  307. probe := getProbe(c)
  308. wantRunning := c.Running && !c.Paused && probe != nil
  309. if wantRunning {
  310. if stop := h.OpenMonitorChannel(); stop != nil {
  311. go monitor(daemon, c, stop, probe)
  312. }
  313. } else {
  314. h.CloseMonitorChannel()
  315. }
  316. }
  317. // Reset the health state for a newly-started, restarted or restored container.
  318. // initHealthMonitor is called from monitor.go and we should never be running
  319. // two instances at once.
  320. // Called with c locked.
  321. func (daemon *Daemon) initHealthMonitor(c *container.Container) {
  322. // If no healthcheck is setup then don't init the monitor
  323. if getProbe(c) == nil {
  324. return
  325. }
  326. // This is needed in case we're auto-restarting
  327. daemon.stopHealthchecks(c)
  328. if h := c.State.Health; h != nil {
  329. h.SetStatus(types.Starting)
  330. h.FailingStreak = 0
  331. } else {
  332. h := &container.Health{}
  333. h.SetStatus(types.Starting)
  334. c.State.Health = h
  335. }
  336. daemon.updateHealthMonitor(c)
  337. }
  338. // Called when the container is being stopped (whether because the health check is
  339. // failing or for any other reason).
  340. func (daemon *Daemon) stopHealthchecks(c *container.Container) {
  341. h := c.State.Health
  342. if h != nil {
  343. h.CloseMonitorChannel()
  344. }
  345. }
  346. // Buffer up to maxOutputLen bytes. Further data is discarded.
  347. type limitedBuffer struct {
  348. buf bytes.Buffer
  349. mu sync.Mutex
  350. truncated bool // indicates that data has been lost
  351. }
  352. // Append to limitedBuffer while there is room.
  353. func (b *limitedBuffer) Write(data []byte) (int, error) {
  354. b.mu.Lock()
  355. defer b.mu.Unlock()
  356. bufLen := b.buf.Len()
  357. dataLen := len(data)
  358. keep := minInt(maxOutputLen-bufLen, dataLen)
  359. if keep > 0 {
  360. b.buf.Write(data[:keep])
  361. }
  362. if keep < dataLen {
  363. b.truncated = true
  364. }
  365. return dataLen, nil
  366. }
  367. // The contents of the buffer, with "..." appended if it overflowed.
  368. func (b *limitedBuffer) String() string {
  369. b.mu.Lock()
  370. defer b.mu.Unlock()
  371. out := b.buf.String()
  372. if b.truncated {
  373. out = out + "..."
  374. }
  375. return out
  376. }
  377. // If configuredValue is zero, use defaultValue instead.
  378. func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
  379. if configuredValue == 0 {
  380. return defaultValue
  381. }
  382. return configuredValue
  383. }
  384. func minInt(x, y int) int {
  385. if x < y {
  386. return x
  387. }
  388. return y
  389. }
  390. func getShell(cntr *container.Container) []string {
  391. if len(cntr.Config.Shell) != 0 {
  392. return cntr.Config.Shell
  393. }
  394. if runtime.GOOS != "windows" {
  395. return []string{"/bin/sh", "-c"}
  396. }
  397. if cntr.OS != runtime.GOOS {
  398. return []string{"/bin/sh", "-c"}
  399. }
  400. return []string{"cmd", "/S", "/C"}
  401. }