monitor.go 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309
  1. package daemon
  2. import (
  3. "io"
  4. "os/exec"
  5. "sync"
  6. "time"
  7. "github.com/docker/docker/daemon/execdriver"
  8. "github.com/docker/docker/pkg/log"
  9. "github.com/docker/docker/runconfig"
  10. )
  // defaultTimeIncrement is the initial delay, in milliseconds, that the
  // monitor waits before restarting a container's process.
  const defaultTimeIncrement = 100
  12. // containerMonitor monitors the execution of a container's main process.
  13. // If a restart policy is specified for the cotnainer the monitor will ensure that the
  14. // process is restarted based on the rules of the policy. When the container is finally stopped
  15. // the monitor will reset and cleanup any of the container resources such as networking allocations
  16. // and the rootfs
  17. type containerMonitor struct {
  18. mux sync.Mutex
  19. // container is the container being monitored
  20. container *Container
  21. // restartPolicy is the current policy being applied to the container monitor
  22. restartPolicy runconfig.RestartPolicy
  23. // failureCount is the number of times the container has failed to
  24. // start in a row
  25. failureCount int
  26. // shouldStop signals the monitor that the next time the container exits it is
  27. // either because docker or the user asked for the container to be stopped
  28. shouldStop bool
  29. // startSignal is a channel that is closes after the container initially starts
  30. startSignal chan struct{}
  31. // stopChan is used to signal to the monitor whenever there is a wait for the
  32. // next restart so that the timeIncrement is not honored and the user is not
  33. // left waiting for nothing to happen during this time
  34. stopChan chan struct{}
  35. // timeIncrement is the amount of time to wait between restarts
  36. // this is in milliseconds
  37. timeIncrement int
  38. // lastStartTime is the time which the monitor last exec'd the container's process
  39. lastStartTime time.Time
  40. }
  41. // newContainerMonitor returns an initialized containerMonitor for the provided container
  42. // honoring the provided restart policy
  43. func newContainerMonitor(container *Container, policy runconfig.RestartPolicy) *containerMonitor {
  44. return &containerMonitor{
  45. container: container,
  46. restartPolicy: policy,
  47. timeIncrement: defaultTimeIncrement,
  48. stopChan: make(chan struct{}),
  49. startSignal: make(chan struct{}),
  50. }
  51. }
  52. // Stop signals to the container monitor that it should stop monitoring the container
  53. // for exits the next time the process dies
  54. func (m *containerMonitor) ExitOnNext() {
  55. m.mux.Lock()
  56. // we need to protect having a double close of the channel when stop is called
  57. // twice or else we will get a panic
  58. if !m.shouldStop {
  59. m.shouldStop = true
  60. close(m.stopChan)
  61. }
  62. m.mux.Unlock()
  63. }
  64. // Close closes the container's resources such as networking allocations and
  65. // unmounts the contatiner's root filesystem
  66. func (m *containerMonitor) Close() error {
  67. // Cleanup networking and mounts
  68. m.container.cleanup()
  69. // FIXME: here is race condition between two RUN instructions in Dockerfile
  70. // because they share same runconfig and change image. Must be fixed
  71. // in builder/builder.go
  72. if err := m.container.toDisk(); err != nil {
  73. log.Errorf("Error dumping container %s state to disk: %s", m.container.ID, err)
  74. return err
  75. }
  76. return nil
  77. }
  78. // Start starts the containers process and monitors it according to the restart policy
  79. func (m *containerMonitor) Start() error {
  80. var (
  81. err error
  82. exitStatus int
  83. // this variable indicates where we in execution flow:
  84. // before Run or after
  85. afterRun bool
  86. )
  87. // ensure that when the monitor finally exits we release the networking and unmount the rootfs
  88. defer func() {
  89. if afterRun {
  90. m.container.Lock()
  91. m.container.State.setStopped(exitStatus)
  92. defer m.container.Unlock()
  93. }
  94. m.Close()
  95. }()
  96. // reset the restart count
  97. m.container.RestartCount = -1
  98. for {
  99. m.container.RestartCount++
  100. if err := m.container.startLoggingToDisk(); err != nil {
  101. m.resetContainer(false)
  102. return err
  103. }
  104. pipes := execdriver.NewPipes(m.container.stdin, m.container.stdout, m.container.stderr, m.container.Config.OpenStdin)
  105. m.container.LogEvent("start")
  106. m.lastStartTime = time.Now()
  107. if exitStatus, err = m.container.daemon.Run(m.container, pipes, m.callback); err != nil {
  108. // if we receive an internal error from the initial start of a container then lets
  109. // return it instead of entering the restart loop
  110. if m.container.RestartCount == 0 {
  111. m.resetContainer(false)
  112. return err
  113. }
  114. log.Errorf("Error running container: %s", err)
  115. }
  116. // here container.Lock is already lost
  117. afterRun = true
  118. m.resetMonitor(err == nil && exitStatus == 0)
  119. if m.shouldRestart(exitStatus) {
  120. m.container.State.SetRestarting(exitStatus)
  121. m.container.LogEvent("die")
  122. m.resetContainer(true)
  123. // sleep with a small time increment between each restart to help avoid issues cased by quickly
  124. // restarting the container because of some types of errors ( networking cut out, etc... )
  125. m.waitForNextRestart()
  126. // we need to check this before reentering the loop because the waitForNextRestart could have
  127. // been terminated by a request from a user
  128. if m.shouldStop {
  129. return err
  130. }
  131. continue
  132. }
  133. m.container.LogEvent("die")
  134. m.resetContainer(true)
  135. return err
  136. }
  137. }
  138. // resetMonitor resets the stateful fields on the containerMonitor based on the
  139. // previous runs success or failure. Reguardless of success, if the container had
  140. // an execution time of more than 10s then reset the timer back to the default
  141. func (m *containerMonitor) resetMonitor(successful bool) {
  142. executionTime := time.Now().Sub(m.lastStartTime).Seconds()
  143. if executionTime > 10 {
  144. m.timeIncrement = defaultTimeIncrement
  145. } else {
  146. // otherwise we need to increment the amount of time we wait before restarting
  147. // the process. We will build up by multiplying the increment by 2
  148. m.timeIncrement *= 2
  149. }
  150. // the container exited successfully so we need to reset the failure counter
  151. if successful {
  152. m.failureCount = 0
  153. } else {
  154. m.failureCount++
  155. }
  156. }
  157. // waitForNextRestart waits with the default time increment to restart the container unless
  158. // a user or docker asks for the container to be stopped
  159. func (m *containerMonitor) waitForNextRestart() {
  160. select {
  161. case <-time.After(time.Duration(m.timeIncrement) * time.Millisecond):
  162. case <-m.stopChan:
  163. }
  164. }
  165. // shouldRestart checks the restart policy and applies the rules to determine if
  166. // the container's process should be restarted
  167. func (m *containerMonitor) shouldRestart(exitStatus int) bool {
  168. m.mux.Lock()
  169. defer m.mux.Unlock()
  170. // do not restart if the user or docker has requested that this container be stopped
  171. if m.shouldStop {
  172. return false
  173. }
  174. switch m.restartPolicy.Name {
  175. case "always":
  176. return true
  177. case "on-failure":
  178. // the default value of 0 for MaximumRetryCount means that we will not enforce a maximum count
  179. if max := m.restartPolicy.MaximumRetryCount; max != 0 && m.failureCount >= max {
  180. log.Debugf("stopping restart of container %s because maximum failure could of %d has been reached", max)
  181. return false
  182. }
  183. return exitStatus != 0
  184. }
  185. return false
  186. }
  187. // callback ensures that the container's state is properly updated after we
  188. // received ack from the execution drivers
  189. func (m *containerMonitor) callback(command *execdriver.Command) {
  190. if command.Tty {
  191. // The callback is called after the process Start()
  192. // so we are in the parent process. In TTY mode, stdin/out/err is the PtySlace
  193. // which we close here.
  194. if c, ok := command.Stdout.(io.Closer); ok {
  195. c.Close()
  196. }
  197. }
  198. m.container.State.setRunning(command.Pid())
  199. // signal that the process has started
  200. // close channel only if not closed
  201. select {
  202. case <-m.startSignal:
  203. default:
  204. close(m.startSignal)
  205. }
  206. if err := m.container.ToDisk(); err != nil {
  207. log.Debugf("%s", err)
  208. }
  209. }
  210. // resetContainer resets the container's IO and ensures that the command is able to be executed again
  211. // by copying the data into a new struct
  212. // if lock is true, then container locked during reset
  213. func (m *containerMonitor) resetContainer(lock bool) {
  214. container := m.container
  215. if lock {
  216. container.Lock()
  217. defer container.Unlock()
  218. }
  219. if container.Config.OpenStdin {
  220. if err := container.stdin.Close(); err != nil {
  221. log.Errorf("%s: Error close stdin: %s", container.ID, err)
  222. }
  223. }
  224. if err := container.stdout.Clean(); err != nil {
  225. log.Errorf("%s: Error close stdout: %s", container.ID, err)
  226. }
  227. if err := container.stderr.Clean(); err != nil {
  228. log.Errorf("%s: Error close stderr: %s", container.ID, err)
  229. }
  230. if container.command != nil && container.command.Terminal != nil {
  231. if err := container.command.Terminal.Close(); err != nil {
  232. log.Errorf("%s: Error closing terminal: %s", container.ID, err)
  233. }
  234. }
  235. // Re-create a brand new stdin pipe once the container exited
  236. if container.Config.OpenStdin {
  237. container.stdin, container.stdinPipe = io.Pipe()
  238. }
  239. c := container.command.Cmd
  240. container.command.Cmd = exec.Cmd{
  241. Stdin: c.Stdin,
  242. Stdout: c.Stdout,
  243. Stderr: c.Stderr,
  244. Path: c.Path,
  245. Env: c.Env,
  246. ExtraFiles: c.ExtraFiles,
  247. Args: c.Args,
  248. Dir: c.Dir,
  249. SysProcAttr: c.SysProcAttr,
  250. }
  251. }