remote_linux.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. package libcontainerd
  2. import (
  3. "fmt"
  4. "io"
  5. "io/ioutil"
  6. "log"
  7. "net"
  8. "os"
  9. "os/exec"
  10. "path/filepath"
  11. "strconv"
  12. "sync"
  13. "syscall"
  14. "time"
  15. "github.com/Sirupsen/logrus"
  16. containerd "github.com/docker/containerd/api/grpc/types"
  17. "github.com/docker/docker/pkg/locker"
  18. sysinfo "github.com/docker/docker/pkg/system"
  19. "github.com/docker/docker/utils"
  20. "golang.org/x/net/context"
  21. "google.golang.org/grpc"
  22. "google.golang.org/grpc/grpclog"
  23. )
  24. const (
  25. maxConnectionRetryCount = 3
  26. connectionRetryDelay = 3 * time.Second
  27. containerdShutdownTimeout = 15 * time.Second
  28. containerdBinary = "docker-containerd"
  29. containerdPidFilename = "docker-containerd.pid"
  30. containerdSockFilename = "docker-containerd.sock"
  31. eventTimestampFilename = "event.ts"
  32. )
  33. type remote struct {
  34. sync.RWMutex
  35. apiClient containerd.APIClient
  36. daemonPid int
  37. stateDir string
  38. rpcAddr string
  39. startDaemon bool
  40. debugLog bool
  41. rpcConn *grpc.ClientConn
  42. clients []*client
  43. eventTsPath string
  44. pastEvents map[string]*containerd.Event
  45. runtimeArgs []string
  46. }
  47. // New creates a fresh instance of libcontainerd remote.
  48. func New(stateDir string, options ...RemoteOption) (_ Remote, err error) {
  49. defer func() {
  50. if err != nil {
  51. err = fmt.Errorf("Failed to connect to containerd. Please make sure containerd is installed in your PATH or you have specificed the correct address. Got error: %v", err)
  52. }
  53. }()
  54. r := &remote{
  55. stateDir: stateDir,
  56. daemonPid: -1,
  57. eventTsPath: filepath.Join(stateDir, eventTimestampFilename),
  58. pastEvents: make(map[string]*containerd.Event),
  59. }
  60. for _, option := range options {
  61. if err := option.Apply(r); err != nil {
  62. return nil, err
  63. }
  64. }
  65. if err := sysinfo.MkdirAll(stateDir, 0700); err != nil {
  66. return nil, err
  67. }
  68. if r.rpcAddr == "" {
  69. r.rpcAddr = filepath.Join(stateDir, containerdSockFilename)
  70. }
  71. if r.startDaemon {
  72. if err := r.runContainerdDaemon(); err != nil {
  73. return nil, err
  74. }
  75. }
  76. // don't output the grpc reconnect logging
  77. grpclog.SetLogger(log.New(ioutil.Discard, "", log.LstdFlags))
  78. dialOpts := append([]grpc.DialOption{grpc.WithInsecure()},
  79. grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
  80. return net.DialTimeout("unix", addr, timeout)
  81. }),
  82. )
  83. conn, err := grpc.Dial(r.rpcAddr, dialOpts...)
  84. if err != nil {
  85. return nil, fmt.Errorf("error connecting to containerd: %v", err)
  86. }
  87. r.rpcConn = conn
  88. r.apiClient = containerd.NewAPIClient(conn)
  89. go r.handleConnectionChange()
  90. if err := r.startEventsMonitor(); err != nil {
  91. return nil, err
  92. }
  93. return r, nil
  94. }
  95. func (r *remote) handleConnectionChange() {
  96. var transientFailureCount = 0
  97. state := grpc.Idle
  98. for {
  99. s, err := r.rpcConn.WaitForStateChange(context.Background(), state)
  100. if err != nil {
  101. break
  102. }
  103. state = s
  104. logrus.Debugf("containerd connection state change: %v", s)
  105. if r.daemonPid != -1 {
  106. switch state {
  107. case grpc.TransientFailure:
  108. // Reset state to be notified of next failure
  109. transientFailureCount++
  110. if transientFailureCount >= maxConnectionRetryCount {
  111. transientFailureCount = 0
  112. if utils.IsProcessAlive(r.daemonPid) {
  113. utils.KillProcess(r.daemonPid)
  114. }
  115. if err := r.runContainerdDaemon(); err != nil { //FIXME: Handle error
  116. logrus.Errorf("error restarting containerd: %v", err)
  117. }
  118. } else {
  119. state = grpc.Idle
  120. time.Sleep(connectionRetryDelay)
  121. }
  122. case grpc.Shutdown:
  123. // Well, we asked for it to stop, just return
  124. return
  125. }
  126. }
  127. }
  128. }
  129. func (r *remote) Cleanup() {
  130. if r.daemonPid == -1 {
  131. return
  132. }
  133. r.rpcConn.Close()
  134. // Ask the daemon to quit
  135. syscall.Kill(r.daemonPid, syscall.SIGTERM)
  136. // Wait up to 15secs for it to stop
  137. for i := time.Duration(0); i < containerdShutdownTimeout; i += time.Second {
  138. if !utils.IsProcessAlive(r.daemonPid) {
  139. break
  140. }
  141. time.Sleep(time.Second)
  142. }
  143. if utils.IsProcessAlive(r.daemonPid) {
  144. logrus.Warnf("libcontainerd: containerd (%d) didn't stop within 15 secs, killing it\n", r.daemonPid)
  145. syscall.Kill(r.daemonPid, syscall.SIGKILL)
  146. }
  147. // cleanup some files
  148. os.Remove(filepath.Join(r.stateDir, containerdPidFilename))
  149. os.Remove(filepath.Join(r.stateDir, containerdSockFilename))
  150. }
  151. func (r *remote) Client(b Backend) (Client, error) {
  152. c := &client{
  153. clientCommon: clientCommon{
  154. backend: b,
  155. containers: make(map[string]*container),
  156. locker: locker.New(),
  157. },
  158. remote: r,
  159. exitNotifiers: make(map[string]*exitNotifier),
  160. }
  161. r.Lock()
  162. r.clients = append(r.clients, c)
  163. r.Unlock()
  164. return c, nil
  165. }
  166. func (r *remote) updateEventTimestamp(t time.Time) {
  167. f, err := os.OpenFile(r.eventTsPath, syscall.O_CREAT|syscall.O_WRONLY|syscall.O_TRUNC, 0600)
  168. defer f.Close()
  169. if err != nil {
  170. logrus.Warnf("libcontainerd: failed to open event timestamp file: %v", err)
  171. return
  172. }
  173. b, err := t.MarshalText()
  174. if err != nil {
  175. logrus.Warnf("libcontainerd: failed to encode timestamp: %v", err)
  176. return
  177. }
  178. n, err := f.Write(b)
  179. if err != nil || n != len(b) {
  180. logrus.Warnf("libcontainerd: failed to update event timestamp file: %v", err)
  181. f.Truncate(0)
  182. return
  183. }
  184. }
  185. func (r *remote) getLastEventTimestamp() int64 {
  186. t := time.Now()
  187. fi, err := os.Stat(r.eventTsPath)
  188. if os.IsNotExist(err) || fi.Size() == 0 {
  189. return t.Unix()
  190. }
  191. f, err := os.Open(r.eventTsPath)
  192. defer f.Close()
  193. if err != nil {
  194. logrus.Warn("libcontainerd: Unable to access last event ts: %v", err)
  195. return t.Unix()
  196. }
  197. b := make([]byte, fi.Size())
  198. n, err := f.Read(b)
  199. if err != nil || n != len(b) {
  200. logrus.Warn("libcontainerd: Unable to read last event ts: %v", err)
  201. return t.Unix()
  202. }
  203. t.UnmarshalText(b)
  204. return t.Unix()
  205. }
  206. func (r *remote) startEventsMonitor() error {
  207. // First, get past events
  208. er := &containerd.EventsRequest{
  209. Timestamp: uint64(r.getLastEventTimestamp()),
  210. }
  211. events, err := r.apiClient.Events(context.Background(), er)
  212. if err != nil {
  213. return err
  214. }
  215. go r.handleEventStream(events)
  216. return nil
  217. }
  218. func (r *remote) handleEventStream(events containerd.API_EventsClient) {
  219. live := false
  220. for {
  221. e, err := events.Recv()
  222. if err != nil {
  223. logrus.Errorf("failed to receive event from containerd: %v", err)
  224. go r.startEventsMonitor()
  225. return
  226. }
  227. if live == false {
  228. logrus.Debugf("received past containerd event: %#v", e)
  229. // Pause/Resume events should never happens after exit one
  230. switch e.Type {
  231. case StateExit:
  232. r.pastEvents[e.Id] = e
  233. case StatePause:
  234. r.pastEvents[e.Id] = e
  235. case StateResume:
  236. r.pastEvents[e.Id] = e
  237. case stateLive:
  238. live = true
  239. r.updateEventTimestamp(time.Unix(int64(e.Timestamp), 0))
  240. }
  241. } else {
  242. logrus.Debugf("received containerd event: %#v", e)
  243. var container *container
  244. var c *client
  245. r.RLock()
  246. for _, c = range r.clients {
  247. container, err = c.getContainer(e.Id)
  248. if err == nil {
  249. break
  250. }
  251. }
  252. r.RUnlock()
  253. if container == nil {
  254. logrus.Errorf("no state for container: %q", err)
  255. continue
  256. }
  257. if err := container.handleEvent(e); err != nil {
  258. logrus.Errorf("error processing state change for %s: %v", e.Id, err)
  259. }
  260. r.updateEventTimestamp(time.Unix(int64(e.Timestamp), 0))
  261. }
  262. }
  263. }
  264. func (r *remote) runContainerdDaemon() error {
  265. pidFilename := filepath.Join(r.stateDir, containerdPidFilename)
  266. f, err := os.OpenFile(pidFilename, os.O_RDWR|os.O_CREATE, 0600)
  267. defer f.Close()
  268. if err != nil {
  269. return err
  270. }
  271. // File exist, check if the daemon is alive
  272. b := make([]byte, 8)
  273. n, err := f.Read(b)
  274. if err != nil && err != io.EOF {
  275. return err
  276. }
  277. if n > 0 {
  278. pid, err := strconv.ParseUint(string(b[:n]), 10, 64)
  279. if err != nil {
  280. return err
  281. }
  282. if utils.IsProcessAlive(int(pid)) {
  283. logrus.Infof("previous instance of containerd still alive (%d)", pid)
  284. r.daemonPid = int(pid)
  285. return nil
  286. }
  287. }
  288. // rewind the file
  289. _, err = f.Seek(0, os.SEEK_SET)
  290. if err != nil {
  291. return err
  292. }
  293. // Truncate it
  294. err = f.Truncate(0)
  295. if err != nil {
  296. return err
  297. }
  298. // Start a new instance
  299. args := []string{"-l", r.rpcAddr, "--runtime", "docker-runc"}
  300. if r.debugLog {
  301. args = append(args, "--debug", "--metrics-interval=0")
  302. }
  303. if len(r.runtimeArgs) > 0 {
  304. for _, v := range r.runtimeArgs {
  305. args = append(args, "--runtime-args")
  306. args = append(args, v)
  307. }
  308. logrus.Debugf("runContainerdDaemon: runtimeArgs: %s", args)
  309. }
  310. cmd := exec.Command(containerdBinary, args...)
  311. // redirect containerd logs to docker logs
  312. cmd.Stdout = os.Stdout
  313. cmd.Stderr = os.Stderr
  314. cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true}
  315. if err := cmd.Start(); err != nil {
  316. return err
  317. }
  318. logrus.Infof("New containerd process, pid: %d\n", cmd.Process.Pid)
  319. if _, err := f.WriteString(fmt.Sprintf("%d", cmd.Process.Pid)); err != nil {
  320. utils.KillProcess(cmd.Process.Pid)
  321. return err
  322. }
  323. go cmd.Wait() // Reap our child when needed
  324. r.daemonPid = cmd.Process.Pid
  325. return nil
  326. }
  327. // WithRemoteAddr sets the external containerd socket to connect to.
  328. func WithRemoteAddr(addr string) RemoteOption {
  329. return rpcAddr(addr)
  330. }
  331. type rpcAddr string
  332. func (a rpcAddr) Apply(r Remote) error {
  333. if remote, ok := r.(*remote); ok {
  334. remote.rpcAddr = string(a)
  335. return nil
  336. }
  337. return fmt.Errorf("WithRemoteAddr option not supported for this remote")
  338. }
  339. // WithRuntimeArgs sets the list of runtime args passed to containerd
  340. func WithRuntimeArgs(args []string) RemoteOption {
  341. return runtimeArgs(args)
  342. }
  343. type runtimeArgs []string
  344. func (rt runtimeArgs) Apply(r Remote) error {
  345. if remote, ok := r.(*remote); ok {
  346. remote.runtimeArgs = rt
  347. return nil
  348. }
  349. return fmt.Errorf("WithRuntimeArgs option not supported for this remote")
  350. }
  351. // WithStartDaemon defines if libcontainerd should also run containerd daemon.
  352. func WithStartDaemon(start bool) RemoteOption {
  353. return startDaemon(start)
  354. }
  355. type startDaemon bool
  356. func (s startDaemon) Apply(r Remote) error {
  357. if remote, ok := r.(*remote); ok {
  358. remote.startDaemon = bool(s)
  359. return nil
  360. }
  361. return fmt.Errorf("WithStartDaemon option not supported for this remote")
  362. }
  363. // WithDebugLog defines if containerd debug logs will be enabled for daemon.
  364. func WithDebugLog(debug bool) RemoteOption {
  365. return debugLog(debug)
  366. }
  367. type debugLog bool
  368. func (d debugLog) Apply(r Remote) error {
  369. if remote, ok := r.(*remote); ok {
  370. remote.debugLog = bool(d)
  371. return nil
  372. }
  373. return fmt.Errorf("WithDebugLog option not supported for this remote")
  374. }