remote_linux.go 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406
  1. package libcontainerd
  2. import (
  3. "fmt"
  4. "io"
  5. "io/ioutil"
  6. "log"
  7. "net"
  8. "os"
  9. "os/exec"
  10. "path/filepath"
  11. "strconv"
  12. "sync"
  13. "syscall"
  14. "time"
  15. "github.com/Sirupsen/logrus"
  16. containerd "github.com/docker/containerd/api/grpc/types"
  17. sysinfo "github.com/docker/docker/pkg/system"
  18. "github.com/docker/docker/utils"
  19. "golang.org/x/net/context"
  20. "google.golang.org/grpc"
  21. "google.golang.org/grpc/grpclog"
  22. )
const (
	// maxConnectionRetryCount is the number of transient connection failures
	// handleConnectionChange tolerates before it tries to restart containerd.
	maxConnectionRetryCount = 3
	// connectionRetryDelay is how long handleConnectionChange sleeps after a
	// transient failure that is still below the retry threshold.
	connectionRetryDelay = 3 * time.Second
	// containerdShutdownTimeout bounds how long Cleanup waits for containerd
	// to exit after SIGTERM before sending SIGKILL.
	containerdShutdownTimeout = 15 * time.Second
	// containerdBinary is the executable name used to spawn the daemon.
	containerdBinary = "docker-containerd"
	// containerdPidFilename is the pid file name, relative to the state dir.
	containerdPidFilename = "docker-containerd.pid"
	// containerdSockFilename is the default socket name, relative to the
	// state dir, used when no explicit RPC address is configured.
	containerdSockFilename = "docker-containerd.sock"
	// eventTimestampFilename is the file, relative to the state dir, that
	// stores the timestamp of the last processed event.
	eventTimestampFilename = "event.ts"
)
// remote holds the connection to a containerd instance together with the
// bookkeeping needed to (optionally) run that instance as a child daemon.
type remote struct {
	// The embedded lock is taken around accesses to clients in Client and
	// handleEventStream.
	sync.RWMutex
	// apiClient is the containerd gRPC client built on top of rpcConn.
	apiClient containerd.APIClient
	// daemonPid is the pid of the containerd process recorded by
	// runContainerdDaemon; -1 means no daemon pid is known.
	daemonPid int
	// stateDir is the directory holding the pid file, the default socket and
	// the event timestamp file.
	stateDir string
	// rpcAddr is the unix socket path dialed to reach containerd.
	rpcAddr string
	// startDaemon tells New to call runContainerdDaemon before dialing.
	startDaemon bool
	// debugLog adds "--debug true" to the containerd command line.
	debugLog bool
	// rpcConn is the gRPC connection created in New.
	rpcConn *grpc.ClientConn
	// clients lists every client created through Client.
	clients []*client
	// eventTsPath is the full path of the event timestamp file.
	eventTsPath string
	// pastEvents stores exit/pause/resume events received before the stream
	// went live, keyed by the event's Id.
	pastEvents map[string]*containerd.Event
}
  45. // New creates a fresh instance of libcontainerd remote.
  46. func New(stateDir string, options ...RemoteOption) (_ Remote, err error) {
  47. defer func() {
  48. if err != nil {
  49. err = fmt.Errorf("Failed to connect to containerd. Please make sure containerd is installed in your PATH or you have specificed the correct address. Got error: %v", err)
  50. }
  51. }()
  52. r := &remote{
  53. stateDir: stateDir,
  54. daemonPid: -1,
  55. eventTsPath: filepath.Join(stateDir, eventTimestampFilename),
  56. pastEvents: make(map[string]*containerd.Event),
  57. }
  58. for _, option := range options {
  59. if err := option.Apply(r); err != nil {
  60. return nil, err
  61. }
  62. }
  63. if err := sysinfo.MkdirAll(stateDir, 0700); err != nil {
  64. return nil, err
  65. }
  66. if r.rpcAddr == "" {
  67. r.rpcAddr = filepath.Join(stateDir, containerdSockFilename)
  68. }
  69. if r.startDaemon {
  70. if err := r.runContainerdDaemon(); err != nil {
  71. return nil, err
  72. }
  73. }
  74. // don't output the grpc reconnect logging
  75. grpclog.SetLogger(log.New(ioutil.Discard, "", log.LstdFlags))
  76. dialOpts := append([]grpc.DialOption{grpc.WithInsecure()},
  77. grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
  78. return net.DialTimeout("unix", addr, timeout)
  79. }),
  80. )
  81. conn, err := grpc.Dial(r.rpcAddr, dialOpts...)
  82. if err != nil {
  83. return nil, fmt.Errorf("error connecting to containerd: %v", err)
  84. }
  85. r.rpcConn = conn
  86. r.apiClient = containerd.NewAPIClient(conn)
  87. go r.handleConnectionChange()
  88. if err := r.startEventsMonitor(); err != nil {
  89. return nil, err
  90. }
  91. return r, nil
  92. }
  93. func (r *remote) handleConnectionChange() {
  94. var transientFailureCount = 0
  95. state := grpc.Idle
  96. for {
  97. s, err := r.rpcConn.WaitForStateChange(context.Background(), state)
  98. if err != nil {
  99. break
  100. }
  101. state = s
  102. logrus.Debugf("containerd connection state change: %v", s)
  103. if r.daemonPid != -1 {
  104. switch state {
  105. case grpc.TransientFailure:
  106. // Reset state to be notified of next failure
  107. transientFailureCount++
  108. if transientFailureCount >= maxConnectionRetryCount {
  109. transientFailureCount = 0
  110. if utils.IsProcessAlive(r.daemonPid) {
  111. utils.KillProcess(r.daemonPid)
  112. }
  113. if err := r.runContainerdDaemon(); err != nil { //FIXME: Handle error
  114. logrus.Errorf("error restarting containerd: %v", err)
  115. }
  116. } else {
  117. state = grpc.Idle
  118. time.Sleep(connectionRetryDelay)
  119. }
  120. case grpc.Shutdown:
  121. // Well, we asked for it to stop, just return
  122. return
  123. }
  124. }
  125. }
  126. }
  127. func (r *remote) Cleanup() {
  128. if r.daemonPid == -1 {
  129. return
  130. }
  131. r.rpcConn.Close()
  132. // Ask the daemon to quit
  133. syscall.Kill(r.daemonPid, syscall.SIGTERM)
  134. // Wait up to 15secs for it to stop
  135. for i := time.Duration(0); i < containerdShutdownTimeout; i += time.Second {
  136. if !utils.IsProcessAlive(r.daemonPid) {
  137. break
  138. }
  139. time.Sleep(time.Second)
  140. }
  141. if utils.IsProcessAlive(r.daemonPid) {
  142. logrus.Warnf("libcontainerd: containerd (%d) didn't stop within 15 secs, killing it\n", r.daemonPid)
  143. syscall.Kill(r.daemonPid, syscall.SIGKILL)
  144. }
  145. // cleanup some files
  146. os.Remove(filepath.Join(r.stateDir, containerdPidFilename))
  147. os.Remove(filepath.Join(r.stateDir, containerdSockFilename))
  148. }
  149. func (r *remote) Client(b Backend) (Client, error) {
  150. c := &client{
  151. clientCommon: clientCommon{
  152. backend: b,
  153. containerMutexes: make(map[string]*sync.Mutex),
  154. containers: make(map[string]*container),
  155. },
  156. remote: r,
  157. exitNotifiers: make(map[string]*exitNotifier),
  158. }
  159. r.Lock()
  160. r.clients = append(r.clients, c)
  161. r.Unlock()
  162. return c, nil
  163. }
  164. func (r *remote) updateEventTimestamp(t time.Time) {
  165. f, err := os.OpenFile(r.eventTsPath, syscall.O_CREAT|syscall.O_WRONLY|syscall.O_TRUNC, 0600)
  166. defer f.Close()
  167. if err != nil {
  168. logrus.Warnf("libcontainerd: failed to open event timestamp file: %v", err)
  169. return
  170. }
  171. b, err := t.MarshalText()
  172. if err != nil {
  173. logrus.Warnf("libcontainerd: failed to encode timestamp: %v", err)
  174. return
  175. }
  176. n, err := f.Write(b)
  177. if err != nil || n != len(b) {
  178. logrus.Warnf("libcontainerd: failed to update event timestamp file: %v", err)
  179. f.Truncate(0)
  180. return
  181. }
  182. }
  183. func (r *remote) getLastEventTimestamp() int64 {
  184. t := time.Now()
  185. fi, err := os.Stat(r.eventTsPath)
  186. if os.IsNotExist(err) || fi.Size() == 0 {
  187. return t.Unix()
  188. }
  189. f, err := os.Open(r.eventTsPath)
  190. defer f.Close()
  191. if err != nil {
  192. logrus.Warn("libcontainerd: Unable to access last event ts: %v", err)
  193. return t.Unix()
  194. }
  195. b := make([]byte, fi.Size())
  196. n, err := f.Read(b)
  197. if err != nil || n != len(b) {
  198. logrus.Warn("libcontainerd: Unable to read last event ts: %v", err)
  199. return t.Unix()
  200. }
  201. t.UnmarshalText(b)
  202. return t.Unix()
  203. }
  204. func (r *remote) startEventsMonitor() error {
  205. // First, get past events
  206. er := &containerd.EventsRequest{
  207. Timestamp: uint64(r.getLastEventTimestamp()),
  208. }
  209. events, err := r.apiClient.Events(context.Background(), er)
  210. if err != nil {
  211. return err
  212. }
  213. go r.handleEventStream(events)
  214. return nil
  215. }
  216. func (r *remote) handleEventStream(events containerd.API_EventsClient) {
  217. live := false
  218. for {
  219. e, err := events.Recv()
  220. if err != nil {
  221. logrus.Errorf("failed to receive event from containerd: %v", err)
  222. go r.startEventsMonitor()
  223. return
  224. }
  225. if live == false {
  226. logrus.Debugf("received past containerd event: %#v", e)
  227. // Pause/Resume events should never happens after exit one
  228. switch e.Type {
  229. case StateExit:
  230. r.pastEvents[e.Id] = e
  231. case StatePause:
  232. r.pastEvents[e.Id] = e
  233. case StateResume:
  234. r.pastEvents[e.Id] = e
  235. case stateLive:
  236. live = true
  237. r.updateEventTimestamp(time.Unix(int64(e.Timestamp), 0))
  238. }
  239. } else {
  240. logrus.Debugf("received containerd event: %#v", e)
  241. var container *container
  242. var c *client
  243. r.RLock()
  244. for _, c = range r.clients {
  245. container, err = c.getContainer(e.Id)
  246. if err == nil {
  247. break
  248. }
  249. }
  250. r.RUnlock()
  251. if container == nil {
  252. logrus.Errorf("no state for container: %q", err)
  253. continue
  254. }
  255. if err := container.handleEvent(e); err != nil {
  256. logrus.Errorf("error processing state change for %s: %v", e.Id, err)
  257. }
  258. r.updateEventTimestamp(time.Unix(int64(e.Timestamp), 0))
  259. }
  260. }
  261. }
  262. func (r *remote) runContainerdDaemon() error {
  263. pidFilename := filepath.Join(r.stateDir, containerdPidFilename)
  264. f, err := os.OpenFile(pidFilename, os.O_RDWR|os.O_CREATE, 0600)
  265. defer f.Close()
  266. if err != nil {
  267. return err
  268. }
  269. // File exist, check if the daemon is alive
  270. b := make([]byte, 8)
  271. n, err := f.Read(b)
  272. if err != nil && err != io.EOF {
  273. return err
  274. }
  275. if n > 0 {
  276. pid, err := strconv.ParseUint(string(b[:n]), 10, 64)
  277. if err != nil {
  278. return err
  279. }
  280. if utils.IsProcessAlive(int(pid)) {
  281. logrus.Infof("previous instance of containerd still alive (%d)", pid)
  282. r.daemonPid = int(pid)
  283. return nil
  284. }
  285. }
  286. // rewind the file
  287. _, err = f.Seek(0, os.SEEK_SET)
  288. if err != nil {
  289. return err
  290. }
  291. // Truncate it
  292. err = f.Truncate(0)
  293. if err != nil {
  294. return err
  295. }
  296. // Start a new instance
  297. args := []string{"-l", r.rpcAddr, "--runtime", "docker-runc"}
  298. if r.debugLog {
  299. args = append(args, "--debug", "true")
  300. }
  301. cmd := exec.Command(containerdBinary, args...)
  302. // TODO: store logs?
  303. cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true}
  304. if err := cmd.Start(); err != nil {
  305. return err
  306. }
  307. logrus.Infof("New containerd process, pid: %d\n", cmd.Process.Pid)
  308. if _, err := f.WriteString(fmt.Sprintf("%d", cmd.Process.Pid)); err != nil {
  309. utils.KillProcess(cmd.Process.Pid)
  310. return err
  311. }
  312. go cmd.Wait() // Reap our child when needed
  313. r.daemonPid = cmd.Process.Pid
  314. return nil
  315. }
  316. // WithRemoteAddr sets the external containerd socket to connect to.
  317. func WithRemoteAddr(addr string) RemoteOption {
  318. return rpcAddr(addr)
  319. }
  320. type rpcAddr string
  321. func (a rpcAddr) Apply(r Remote) error {
  322. if remote, ok := r.(*remote); ok {
  323. remote.rpcAddr = string(a)
  324. return nil
  325. }
  326. return fmt.Errorf("WithRemoteAddr option not supported for this remote")
  327. }
  328. // WithStartDaemon defines if libcontainerd should also run containerd daemon.
  329. func WithStartDaemon(start bool) RemoteOption {
  330. return startDaemon(start)
  331. }
  332. type startDaemon bool
  333. func (s startDaemon) Apply(r Remote) error {
  334. if remote, ok := r.(*remote); ok {
  335. remote.startDaemon = bool(s)
  336. return nil
  337. }
  338. return fmt.Errorf("WithStartDaemon option not supported for this remote")
  339. }
  340. // WithDebugLog defines if containerd debug logs will be enabled for daemon.
  341. func WithDebugLog(debug bool) RemoteOption {
  342. return debugLog(debug)
  343. }
  344. type debugLog bool
  345. func (d debugLog) Apply(r Remote) error {
  346. if remote, ok := r.(*remote); ok {
  347. remote.debugLog = bool(d)
  348. return nil
  349. }
  350. return fmt.Errorf("WithDebugLog option not supported for this remote")
  351. }