package daemon // import "github.com/docker/docker/daemon"

import (
	"bytes"
	"context"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/docker/docker/api/types"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
	"github.com/sirupsen/logrus"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of the first to the start of the second).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record
	maxLogEntries = 5
)
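
// For reference, these defaults match what a Dockerfile HEALTHCHECK
// instruction falls back to when its options are omitted, e.g. (an
// illustrative example, not part of this package):
//
//	HEALTHCHECK --interval=30s --timeout=30s --start-period=0s --retries=3 \
//		CMD curl -f http://localhost/ || exit 1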
const (
	// Exit status codes that can be returned by the probe command.
	exitStatusHealthy = 0 // Container is healthy
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// exec the healthcheck command in the container.
// Returns the exit code and probe output (if any).
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	startTime := time.Now()
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		cmdSlice = append(getShell(cntr), cmdSlice...)
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)

	execConfig := container.NewExecConfig(cntr)
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User
	execConfig.WorkingDir = cntr.Config.WorkingDir

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	attributes := map[string]string{
		"execID": execConfig.ID,
	}
	d.LogContainerEventWithAttributes(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "), attributes)

	output := &limitedBuffer{}
	probeCtx, cancelProbe := context.WithCancel(ctx)
	defer cancelProbe()
	execErr := make(chan error, 1)

	options := containertypes.ExecStartOptions{
		Stdout: output,
		Stderr: output,
	}

	go func() { execErr <- d.ContainerExecStart(probeCtx, execConfig.ID, options) }()

	// Starting an exec can take a significant amount of time: on the order
	// of 1s in extreme cases. The time it takes dockerd and containerd to
	// start the exec is time that the probe process is not running, and so
	// should not count towards the health check's timeout. Apply a separate
	// timeout to abort if the exec request is wedged.
	tm := time.NewTimer(30 * time.Second)
	defer tm.Stop()
	select {
	case <-tm.C:
		return nil, fmt.Errorf("timed out starting health check for container %s", cntr.ID)
	case err := <-execErr:
		if err != nil {
			return nil, err
		}
	case <-execConfig.Started:
		healthCheckStartDuration.UpdateSince(startTime)
	}
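
	// The exec is now running (or has already failed) before the startup
	// timeout fired. Stop the timer and, if it happened to fire concurrently,
	// drain its channel so the Reset below starts from a clean state.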
	if !tm.Stop() {
		<-tm.C
	}
	probeTimeout := timeoutWithDefault(cntr.Config.Healthcheck.Timeout, defaultProbeTimeout)
	tm.Reset(probeTimeout)
	select {
	case <-tm.C:
		cancelProbe()
		logrus.WithContext(ctx).Debugf("Health check for container %s taking too long", cntr.ID)
		// Wait for probe to exit (it might take some time to call containerd to kill
		// the process and we don't want dying probes to pile up).
		<-execErr

		var msg string
		if out := output.String(); len(out) > 0 {
			msg = fmt.Sprintf("Health check exceeded timeout (%v): %s", probeTimeout, out)
		} else {
			msg = fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout)
		}
		return &types.HealthcheckResult{
			ExitCode: -1,
			Output:   msg,
			End:      time.Now(),
		}, nil
	case err := <-execErr:
		if err != nil {
			return nil, err
		}
	}

	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	// Capture the probe's exit code while holding the exec config's lock;
	// the deferred Unlock releases it on both the error and success paths.
	exitCode, err := func() (int, error) {
		info.Lock()
		defer info.Unlock()
		if info.ExitCode == nil {
			return 0, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
		}
		return *info.ExitCode, nil
	}()
	if err != nil {
		return nil, err
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: exitCode,
		Output:   out,
	}, nil
}
// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// The probe may have been cancelled while waiting on the lock; if so,
	// ignore the result.
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status()

	// Append the result to the log, keeping at most maxLogEntries entries
	// by dropping the oldest ones.
	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.SetStatus(types.Healthy)
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we never had a successful health
		// check), check whether we are still within the container's start
		// period; if so, don't increment the failure streak.
		if h.Status() == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
			}
		}

		if shouldIncrementStreak {
			h.FailingStreak++
			if h.FailingStreak >= retries {
				h.SetStatus(types.Unhealthy)
			}
		}
		// Else we're starting or healthy. Stay in that state.
	}

	// replicate Health status changes
	if err := c.CheckpointTo(d.containersReplica); err != nil {
		// queries will be inconsistent until the next probe runs or another
		// state mutation checkpoints the container
		logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	current := h.Status()
	if oldStatus != current {
		d.LogContainerEvent(c, "health_status: "+current)
	}
}
// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)

	intervalTimer := time.NewTimer(probeInterval)
	defer intervalTimer.Stop()

	for {
		intervalTimer.Reset(probeInterval)

		select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-intervalTimer.C:
			logrus.Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			ctx, cancelProbe := context.WithCancel(context.Background())
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				cancelProbe()
			}
		}
	}
}
// Get a suitable probe implementation for the container's healthcheck configuration.
// Nil will be returned if no healthcheck was configured or NONE was set.
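// The first element of Healthcheck.Test selects the probe type: for example,
// {"CMD", "curl", "-f", "http://localhost/"} execs the command directly,
// {"CMD-SHELL", "curl -f http://localhost/"} runs it via the container's
// default shell, and {"NONE"} disables the check (the URLs here are
// illustrative only).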
func getProbe(c *container.Container) probe {
	config := c.Config.Healthcheck
	if config == nil || len(config.Test) == 0 {
		return nil
	}
	switch config.Test[0] {
	case "CMD":
		return &cmdProbe{shell: false}
	case "CMD-SHELL":
		return &cmdProbe{shell: true}
	case "NONE":
		return nil
	default:
		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD', 'CMD-SHELL' or 'NONE') in container %s", config.Test[0], c.ID)
		return nil
	}
}
// Ensure the health-check monitor is running or not, depending on the current
// state of the container.
// Called from monitor.go, with c locked.
func (daemon *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(daemon, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}

// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (daemon *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is set up then don't init the monitor
	if getProbe(c) == nil {
		return
	}

	// This is needed in case we're auto-restarting
	daemon.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		h.SetStatus(types.Starting)
		h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.SetStatus(types.Starting)
		c.State.Health = h
	}

	daemon.updateHealthMonitor(c)
}

// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (daemon *Daemon) stopHealthchecks(c *container.Container) {
	h := c.State.Health
	if h != nil {
		h.CloseMonitorChannel()
	}
}

// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex
	truncated bool // indicates that data has been lost
}

// Append to limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()

	bufLen := b.buf.Len()
	dataLen := len(data)
	keep := min(maxOutputLen-bufLen, dataLen)
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < dataLen {
		b.truncated = true
	}
	return dataLen, nil
}

// The contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	b.mu.Lock()
	defer b.mu.Unlock()

	out := b.buf.String()
	if b.truncated {
		out = out + "..."
	}
	return out
}
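
// Note that Write always reports the full input length as consumed, even when
// part of it is dropped: this keeps upstream io.Copy-style loops from failing
// with short-write errors once the buffer is full. For example, writing 5000
// bytes into an empty limitedBuffer stores the first 4096 (maxOutputLen),
// returns (5000, nil), and String() then yields the stored prefix plus "...".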
// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue == 0 {
		return defaultValue
	}
	return configuredValue
}

func min(x, y int) int {
	if x < y {
		return x
	}
	return y
}

// getShell returns the shell used to wrap CMD-SHELL probes: the container's
// configured Shell if set, otherwise /bin/sh on non-Windows daemons (and for
// non-Windows containers running on a Windows daemon), and cmd for Windows
// containers.
func getShell(cntr *container.Container) []string {
	if len(cntr.Config.Shell) != 0 {
		return cntr.Config.Shell
	}
	if runtime.GOOS != "windows" {
		return []string{"/bin/sh", "-c"}
	}
	if cntr.OS != runtime.GOOS {
		return []string{"/bin/sh", "-c"}
	}
	return []string{"cmd", "/S", "/C"}
}