monitor.go 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339
  1. package daemon
  2. import (
  3. "io"
  4. "os/exec"
  5. "sync"
  6. "time"
  7. "github.com/Sirupsen/logrus"
  8. "github.com/docker/docker/daemon/execdriver"
  9. "github.com/docker/docker/pkg/stringid"
  10. "github.com/docker/docker/runconfig"
  11. )
  // Tunables used by containerMonitor.
  const (
  	// loggerCloseTimeout bounds how long resetContainer waits for the log
  	// copier to finish before it closes the log driver anyway.
  	loggerCloseTimeout = 10 * time.Second

  	// defaultTimeIncrement is the initial delay between two restarts, in
  	// milliseconds. resetMonitor doubles the monitor's delay after a run of
  	// ten seconds or less and restores this value after a longer run.
  	defaultTimeIncrement = 100
  )
  16. // containerMonitor monitors the execution of a container's main process.
  17. // If a restart policy is specified for the container the monitor will ensure that the
  18. // process is restarted based on the rules of the policy. When the container is finally stopped
  19. // the monitor will reset and cleanup any of the container resources such as networking allocations
  20. // and the rootfs
  21. type containerMonitor struct {
  22. mux sync.Mutex
  23. // container is the container being monitored
  24. container *Container
  25. // restartPolicy is the current policy being applied to the container monitor
  26. restartPolicy runconfig.RestartPolicy
  27. // failureCount is the number of times the container has failed to
  28. // start in a row
  29. failureCount int
  30. // shouldStop signals the monitor that the next time the container exits it is
  31. // either because docker or the user asked for the container to be stopped
  32. shouldStop bool
  33. // startSignal is a channel that is closes after the container initially starts
  34. startSignal chan struct{}
  35. // stopChan is used to signal to the monitor whenever there is a wait for the
  36. // next restart so that the timeIncrement is not honored and the user is not
  37. // left waiting for nothing to happen during this time
  38. stopChan chan struct{}
  39. // timeIncrement is the amount of time to wait between restarts
  40. // this is in milliseconds
  41. timeIncrement int
  42. // lastStartTime is the time which the monitor last exec'd the container's process
  43. lastStartTime time.Time
  44. }
  45. // newContainerMonitor returns an initialized containerMonitor for the provided container
  46. // honoring the provided restart policy
  47. func newContainerMonitor(container *Container, policy runconfig.RestartPolicy) *containerMonitor {
  48. return &containerMonitor{
  49. container: container,
  50. restartPolicy: policy,
  51. timeIncrement: defaultTimeIncrement,
  52. stopChan: make(chan struct{}),
  53. startSignal: make(chan struct{}),
  54. }
  55. }
  56. // Stop signals to the container monitor that it should stop monitoring the container
  57. // for exits the next time the process dies
  58. func (m *containerMonitor) ExitOnNext() {
  59. m.mux.Lock()
  60. // we need to protect having a double close of the channel when stop is called
  61. // twice or else we will get a panic
  62. if !m.shouldStop {
  63. m.shouldStop = true
  64. close(m.stopChan)
  65. }
  66. m.mux.Unlock()
  67. }
  68. // Close closes the container's resources such as networking allocations and
  69. // unmounts the contatiner's root filesystem
  70. func (m *containerMonitor) Close() error {
  71. // Cleanup networking and mounts
  72. m.container.cleanup()
  73. // FIXME: here is race condition between two RUN instructions in Dockerfile
  74. // because they share same runconfig and change image. Must be fixed
  75. // in builder/builder.go
  76. if err := m.container.toDisk(); err != nil {
  77. logrus.Errorf("Error dumping container %s state to disk: %s", m.container.ID, err)
  78. return err
  79. }
  80. return nil
  81. }
  82. // Start starts the containers process and monitors it according to the restart policy
  83. func (m *containerMonitor) Start() error {
  84. var (
  85. err error
  86. exitStatus execdriver.ExitStatus
  87. // this variable indicates where we in execution flow:
  88. // before Run or after
  89. afterRun bool
  90. )
  91. // ensure that when the monitor finally exits we release the networking and unmount the rootfs
  92. defer func() {
  93. if afterRun {
  94. m.container.Lock()
  95. m.container.setStopped(&exitStatus)
  96. defer m.container.Unlock()
  97. }
  98. m.Close()
  99. }()
  100. // reset the restart count
  101. m.container.RestartCount = -1
  102. for {
  103. m.container.RestartCount++
  104. if err := m.container.startLogging(); err != nil {
  105. m.resetContainer(false)
  106. return err
  107. }
  108. pipes := execdriver.NewPipes(m.container.stdin, m.container.stdout, m.container.stderr, m.container.Config.OpenStdin)
  109. m.container.LogEvent("start")
  110. m.lastStartTime = time.Now()
  111. if exitStatus, err = m.container.daemon.Run(m.container, pipes, m.callback); err != nil {
  112. // if we receive an internal error from the initial start of a container then lets
  113. // return it instead of entering the restart loop
  114. if m.container.RestartCount == 0 {
  115. m.container.ExitCode = -1
  116. m.resetContainer(false)
  117. return err
  118. }
  119. logrus.Errorf("Error running container: %s", err)
  120. }
  121. // here container.Lock is already lost
  122. afterRun = true
  123. m.resetMonitor(err == nil && exitStatus.ExitCode == 0)
  124. if m.shouldRestart(exitStatus.ExitCode) {
  125. m.container.SetRestarting(&exitStatus)
  126. if exitStatus.OOMKilled {
  127. m.container.LogEvent("oom")
  128. }
  129. m.container.LogEvent("die")
  130. m.resetContainer(true)
  131. // sleep with a small time increment between each restart to help avoid issues cased by quickly
  132. // restarting the container because of some types of errors ( networking cut out, etc... )
  133. m.waitForNextRestart()
  134. // we need to check this before reentering the loop because the waitForNextRestart could have
  135. // been terminated by a request from a user
  136. if m.shouldStop {
  137. return err
  138. }
  139. continue
  140. }
  141. if exitStatus.OOMKilled {
  142. m.container.LogEvent("oom")
  143. }
  144. m.container.LogEvent("die")
  145. m.resetContainer(true)
  146. return err
  147. }
  148. }
  149. // resetMonitor resets the stateful fields on the containerMonitor based on the
  150. // previous runs success or failure. Regardless of success, if the container had
  151. // an execution time of more than 10s then reset the timer back to the default
  152. func (m *containerMonitor) resetMonitor(successful bool) {
  153. executionTime := time.Now().Sub(m.lastStartTime).Seconds()
  154. if executionTime > 10 {
  155. m.timeIncrement = defaultTimeIncrement
  156. } else {
  157. // otherwise we need to increment the amount of time we wait before restarting
  158. // the process. We will build up by multiplying the increment by 2
  159. m.timeIncrement *= 2
  160. }
  161. // the container exited successfully so we need to reset the failure counter
  162. if successful {
  163. m.failureCount = 0
  164. } else {
  165. m.failureCount++
  166. }
  167. }
  168. // waitForNextRestart waits with the default time increment to restart the container unless
  169. // a user or docker asks for the container to be stopped
  170. func (m *containerMonitor) waitForNextRestart() {
  171. select {
  172. case <-time.After(time.Duration(m.timeIncrement) * time.Millisecond):
  173. case <-m.stopChan:
  174. }
  175. }
  176. // shouldRestart checks the restart policy and applies the rules to determine if
  177. // the container's process should be restarted
  178. func (m *containerMonitor) shouldRestart(exitCode int) bool {
  179. m.mux.Lock()
  180. defer m.mux.Unlock()
  181. // do not restart if the user or docker has requested that this container be stopped
  182. if m.shouldStop {
  183. return false
  184. }
  185. switch {
  186. case m.restartPolicy.IsAlways():
  187. return true
  188. case m.restartPolicy.IsOnFailure():
  189. // the default value of 0 for MaximumRetryCount means that we will not enforce a maximum count
  190. if max := m.restartPolicy.MaximumRetryCount; max != 0 && m.failureCount > max {
  191. logrus.Debugf("stopping restart of container %s because maximum failure could of %d has been reached",
  192. stringid.TruncateID(m.container.ID), max)
  193. return false
  194. }
  195. return exitCode != 0
  196. }
  197. return false
  198. }
  199. // callback ensures that the container's state is properly updated after we
  200. // received ack from the execution drivers
  201. func (m *containerMonitor) callback(processConfig *execdriver.ProcessConfig, pid int) {
  202. if processConfig.Tty {
  203. // The callback is called after the process Start()
  204. // so we are in the parent process. In TTY mode, stdin/out/err is the PtySlave
  205. // which we close here.
  206. if c, ok := processConfig.Stdout.(io.Closer); ok {
  207. c.Close()
  208. }
  209. }
  210. m.container.setRunning(pid)
  211. // signal that the process has started
  212. // close channel only if not closed
  213. select {
  214. case <-m.startSignal:
  215. default:
  216. close(m.startSignal)
  217. }
  218. if err := m.container.ToDisk(); err != nil {
  219. logrus.Errorf("Error saving container to disk: %v", err)
  220. }
  221. }
  222. // resetContainer resets the container's IO and ensures that the command is able to be executed again
  223. // by copying the data into a new struct
  224. // if lock is true, then container locked during reset
  225. func (m *containerMonitor) resetContainer(lock bool) {
  226. container := m.container
  227. if lock {
  228. container.Lock()
  229. defer container.Unlock()
  230. }
  231. if container.Config.OpenStdin {
  232. if err := container.stdin.Close(); err != nil {
  233. logrus.Errorf("%s: Error close stdin: %s", container.ID, err)
  234. }
  235. }
  236. if err := container.stdout.Clean(); err != nil {
  237. logrus.Errorf("%s: Error close stdout: %s", container.ID, err)
  238. }
  239. if err := container.stderr.Clean(); err != nil {
  240. logrus.Errorf("%s: Error close stderr: %s", container.ID, err)
  241. }
  242. if container.command != nil && container.command.ProcessConfig.Terminal != nil {
  243. if err := container.command.ProcessConfig.Terminal.Close(); err != nil {
  244. logrus.Errorf("%s: Error closing terminal: %s", container.ID, err)
  245. }
  246. }
  247. // Re-create a brand new stdin pipe once the container exited
  248. if container.Config.OpenStdin {
  249. container.stdin, container.stdinPipe = io.Pipe()
  250. }
  251. if container.logDriver != nil {
  252. if container.logCopier != nil {
  253. exit := make(chan struct{})
  254. go func() {
  255. container.logCopier.Wait()
  256. close(exit)
  257. }()
  258. select {
  259. case <-time.After(loggerCloseTimeout):
  260. logrus.Warnf("Logger didn't exit in time: logs may be truncated")
  261. case <-exit:
  262. }
  263. }
  264. container.logDriver.Close()
  265. container.logCopier = nil
  266. container.logDriver = nil
  267. }
  268. c := container.command.ProcessConfig.Cmd
  269. container.command.ProcessConfig.Cmd = exec.Cmd{
  270. Stdin: c.Stdin,
  271. Stdout: c.Stdout,
  272. Stderr: c.Stderr,
  273. Path: c.Path,
  274. Env: c.Env,
  275. ExtraFiles: c.ExtraFiles,
  276. Args: c.Args,
  277. Dir: c.Dir,
  278. SysProcAttr: c.SysProcAttr,
  279. }
  280. }