container.go

package container

import (
	"errors"
	"fmt"
	"net"
	"strings"
	"time"

	"github.com/Sirupsen/logrus"
	clustertypes "github.com/docker/docker/daemon/cluster/provider"
	"github.com/docker/docker/reference"
	"github.com/docker/engine-api/types"
	enginecontainer "github.com/docker/engine-api/types/container"
	"github.com/docker/engine-api/types/events"
	"github.com/docker/engine-api/types/filters"
	"github.com/docker/engine-api/types/network"
	"github.com/docker/swarmkit/agent/exec"
	"github.com/docker/swarmkit/api"
)
const (
	// Explicitly use the kernel's default setting for CPU quota of 100ms.
	// https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt
	cpuQuotaPeriod = 100 * time.Millisecond

	// systemLabelPrefix represents the reserved namespace for system labels.
	systemLabelPrefix = "com.docker.swarm"
)
// containerConfig converts task properties into Docker container-compatible
// components.
type containerConfig struct {
	task                *api.Task
	networksAttachments map[string]*api.NetworkAttachment
}
// newContainerConfig returns a validated container config. No methods should
// return an error if this function returns without error.
func newContainerConfig(t *api.Task) (*containerConfig, error) {
	var c containerConfig
	return &c, c.setTask(t)
}
func (c *containerConfig) setTask(t *api.Task) error {
	container := t.Spec.GetContainer()
	if container == nil {
		return exec.ErrRuntimeUnsupported
	}

	if container.Image == "" {
		return ErrImageRequired
	}

	if err := validateMounts(container.Mounts); err != nil {
		return err
	}

	// Index the networks by name.
	c.networksAttachments = make(map[string]*api.NetworkAttachment, len(t.Networks))
	for _, attachment := range t.Networks {
		c.networksAttachments[attachment.Network.Spec.Annotations.Name] = attachment
	}

	c.task = t
	return nil
}
func (c *containerConfig) endpoint() *api.Endpoint {
	return c.task.Endpoint
}

func (c *containerConfig) spec() *api.ContainerSpec {
	return c.task.Spec.GetContainer()
}
func (c *containerConfig) name() string {
	if c.task.Annotations.Name != "" {
		// If set, use the container Annotations.Name field, set in the orchestrator.
		return c.task.Annotations.Name
	}

	// Fall back to service.slot.id.
	return strings.Join([]string{c.task.ServiceAnnotations.Name, fmt.Sprint(c.task.Slot), c.task.ID}, ".")
}
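// Illustrative example (hypothetical values): with ServiceAnnotations.Name
// "redis", Slot 1, and task ID "xyz", the fallback name is "redis.1.xyz".
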
func (c *containerConfig) image() string {
	raw := c.spec().Image
	ref, err := reference.ParseNamed(raw)
	if err != nil {
		return raw
	}
	return reference.WithDefaultTag(ref).String()
}
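// For example, an untagged reference such as "redis" picks up the reference
// package's default tag (conventionally "latest"), while a string that fails
// to parse is returned unchanged.
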
func (c *containerConfig) config() *enginecontainer.Config {
	config := &enginecontainer.Config{
		Labels:     c.labels(),
		User:       c.spec().User,
		Env:        c.spec().Env,
		WorkingDir: c.spec().Dir,
		Image:      c.image(),
		Volumes:    c.volumes(),
	}

	if len(c.spec().Command) > 0 {
		// If Command is provided, we replace the whole invocation with Command
		// by replacing Entrypoint and specifying Cmd. Any Args are appended to
		// Cmd in this case.
		config.Entrypoint = append(config.Entrypoint, c.spec().Command...)
		config.Cmd = append(config.Cmd, c.spec().Args...)
	} else if len(c.spec().Args) > 0 {
		// In this case, we assume the image has an Entrypoint and Args
		// specifies the arguments for that entrypoint.
		config.Cmd = c.spec().Args
	}

	return config
}
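// Illustrative example (hypothetical spec): Command ["nginx"] with Args
// ["-g", "daemon off;"] produces Entrypoint ["nginx"] and Cmd ["-g", "daemon off;"];
// an empty Command with the same Args leaves the image's Entrypoint alone and
// sets only Cmd.
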
func (c *containerConfig) labels() map[string]string {
	var (
		system = map[string]string{
			"task":         "", // mark as cluster task
			"task.id":      c.task.ID,
			"task.name":    fmt.Sprintf("%v.%v", c.task.ServiceAnnotations.Name, c.task.Slot),
			"node.id":      c.task.NodeID,
			"service.id":   c.task.ServiceID,
			"service.name": c.task.ServiceAnnotations.Name,
		}
		labels = make(map[string]string)
	)

	// Base labels are those defined in the spec.
	for k, v := range c.spec().Labels {
		labels[k] = v
	}

	// Then apply the overrides from the task, which may be set via the
	// orchestrator.
	for k, v := range c.task.Annotations.Labels {
		labels[k] = v
	}

	// Finally, apply the system labels, which override all other labels.
	for k, v := range system {
		labels[strings.Join([]string{systemLabelPrefix, k}, ".")] = v
	}

	return labels
}
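// The system keys above end up namespaced under systemLabelPrefix, so the
// resulting labels include, for example, "com.docker.swarm.task.id" and
// "com.docker.swarm.service.name".
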
// volumes returns the set of anonymous volumes, which is placed into the
// Volumes field of the container config.
func (c *containerConfig) volumes() map[string]struct{} {
	r := make(map[string]struct{})

	// Volumes *only* creates anonymous volumes. The rest are mixed in with
	// binds, which aren't actually binds. Basically, any volume that
	// results in a single component must be added here.
	//
	// This is reverse engineered from the behavior of the engine API.
	for _, mount := range c.spec().Mounts {
		if mount.Type == api.MountTypeVolume && mount.Source == "" {
			r[mount.Target] = struct{}{}
		}
	}

	return r
}
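// Illustrative example (hypothetical mount): a volume mount with an empty
// Source and Target "/data" is recorded here as the anonymous volume "/data";
// a named volume (non-empty Source) is emitted by binds() instead.
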
func (c *containerConfig) tmpfs() map[string]string {
	r := make(map[string]string)

	for _, spec := range c.spec().Mounts {
		if spec.Type != api.MountTypeTmpfs {
			continue
		}

		r[spec.Target] = getMountMask(&spec)
	}

	return r
}
func (c *containerConfig) binds() []string {
	var r []string

	for _, mount := range c.spec().Mounts {
		if mount.Type == api.MountTypeBind || (mount.Type == api.MountTypeVolume && mount.Source != "") {
			spec := fmt.Sprintf("%s:%s", mount.Source, mount.Target)
			mask := getMountMask(&mount)
			if mask != "" {
				spec = fmt.Sprintf("%s:%s", spec, mask)
			}
			r = append(r, spec)
		}
	}

	return r
}
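// Illustrative example (hypothetical mount): a read-only bind mount of
// "/var/log" onto "/host/log" becomes the bind string "/var/log:/host/log:ro".
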
func getMountMask(m *api.Mount) string {
	var maskOpts []string
	if m.ReadOnly {
		maskOpts = append(maskOpts, "ro")
	}

	switch m.Type {
	case api.MountTypeVolume:
		if m.VolumeOptions != nil && m.VolumeOptions.NoCopy {
			maskOpts = append(maskOpts, "nocopy")
		}
	case api.MountTypeBind:
		if m.BindOptions == nil {
			break
		}

		switch m.BindOptions.Propagation {
		case api.MountPropagationPrivate:
			maskOpts = append(maskOpts, "private")
		case api.MountPropagationRPrivate:
			maskOpts = append(maskOpts, "rprivate")
		case api.MountPropagationShared:
			maskOpts = append(maskOpts, "shared")
		case api.MountPropagationRShared:
			maskOpts = append(maskOpts, "rshared")
		case api.MountPropagationSlave:
			maskOpts = append(maskOpts, "slave")
		case api.MountPropagationRSlave:
			maskOpts = append(maskOpts, "rslave")
		}
	case api.MountTypeTmpfs:
		if m.TmpfsOptions == nil {
			break
		}

		if m.TmpfsOptions.Mode != 0 {
			maskOpts = append(maskOpts, fmt.Sprintf("mode=%o", m.TmpfsOptions.Mode))
		}

		if m.TmpfsOptions.SizeBytes != 0 {
			// Calculate the suffix here, making this Linux-specific, but that
			// is okay, since the API is that way anyway. We do this by finding
			// the suffix that divides evenly into the value, returning the
			// value itself, with no suffix, if it fails.
			//
			// For the most part, we don't enforce any semantics on these
			// values. The operating system will usually align them and
			// enforce minimums and maximums.
			var (
				size   = m.TmpfsOptions.SizeBytes
				suffix string
			)
			for _, r := range []struct {
				suffix  string
				divisor int64
			}{
				{"g", 1 << 30},
				{"m", 1 << 20},
				{"k", 1 << 10},
			} {
				if size%r.divisor == 0 {
					size = size / r.divisor
					suffix = r.suffix
					break
				}
			}

			maskOpts = append(maskOpts, fmt.Sprintf("size=%d%s", size, suffix))
		}
	}

	return strings.Join(maskOpts, ",")
}
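// Illustrative example (hypothetical options): TmpfsOptions with Mode 01777
// and SizeBytes 67108864 (64 MiB) produce the mask "mode=1777,size=64m",
// while a size such as 1000, which no suffix divides evenly, stays "size=1000".
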
func (c *containerConfig) hostConfig() *enginecontainer.HostConfig {
	hc := &enginecontainer.HostConfig{
		Resources: c.resources(),
		Binds:     c.binds(),
		Tmpfs:     c.tmpfs(),
	}

	if c.task.LogDriver != nil {
		hc.LogConfig = enginecontainer.LogConfig{
			Type:   c.task.LogDriver.Name,
			Config: c.task.LogDriver.Options,
		}
	}

	return hc
}
// volumeCreateRequest handles the case of volumes that are defined inside a
// service Mount.
func (c *containerConfig) volumeCreateRequest(mount *api.Mount) *types.VolumeCreateRequest {
	var (
		driverName string
		driverOpts map[string]string
		labels     map[string]string
	)

	if mount.VolumeOptions != nil && mount.VolumeOptions.DriverConfig != nil {
		driverName = mount.VolumeOptions.DriverConfig.Name
		driverOpts = mount.VolumeOptions.DriverConfig.Options
		labels = mount.VolumeOptions.Labels
	}

	if mount.VolumeOptions != nil {
		return &types.VolumeCreateRequest{
			Name:       mount.Source,
			Driver:     driverName,
			DriverOpts: driverOpts,
			Labels:     labels,
		}
	}
	return nil
}
func (c *containerConfig) resources() enginecontainer.Resources {
	resources := enginecontainer.Resources{}

	// If no limits are specified, let the engine use its defaults.
	//
	// TODO(aluzzardi): We might want to set some limits anyway, otherwise
	// "unlimited" tasks will step over the reservation of other tasks.
	r := c.task.Spec.Resources
	if r == nil || r.Limits == nil {
		return resources
	}

	if r.Limits.MemoryBytes > 0 {
		resources.Memory = r.Limits.MemoryBytes
	}

	if r.Limits.NanoCPUs > 0 {
		// CPU Period must be set in microseconds.
		resources.CPUPeriod = int64(cpuQuotaPeriod / time.Microsecond)
		resources.CPUQuota = r.Limits.NanoCPUs * resources.CPUPeriod / 1e9
	}

	return resources
}
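// Illustrative arithmetic (hypothetical limit): a limit of 1.5 CPUs arrives as
// NanoCPUs = 1500000000; with the 100ms quota period this yields
// CPUPeriod = 100000 (microseconds) and CPUQuota = 1500000000 * 100000 / 1e9 = 150000.
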
// The Docker daemon supports just one network during container create.
func (c *containerConfig) createNetworkingConfig() *network.NetworkingConfig {
	var networks []*api.NetworkAttachment
	if c.task.Spec.GetContainer() != nil {
		networks = c.task.Networks
	}

	epConfig := make(map[string]*network.EndpointSettings)
	if len(networks) > 0 {
		epConfig[networks[0].Network.Spec.Annotations.Name] = getEndpointConfig(networks[0])
	}

	return &network.NetworkingConfig{EndpointsConfig: epConfig}
}
// TODO: Merge this function with createNetworkingConfig once the daemon
// supports multiple networks in container create.
func (c *containerConfig) connectNetworkingConfig() *network.NetworkingConfig {
	var networks []*api.NetworkAttachment
	if c.task.Spec.GetContainer() != nil {
		networks = c.task.Networks
	}

	// The first network is used during container create. The remaining
	// networks are attached with "docker network connect".
	if len(networks) < 2 {
		return nil
	}

	epConfig := make(map[string]*network.EndpointSettings)
	for _, na := range networks[1:] {
		epConfig[na.Network.Spec.Annotations.Name] = getEndpointConfig(na)
	}
	return &network.NetworkingConfig{EndpointsConfig: epConfig}
}
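// Taken together: for a task attached to networks A, B, and C (illustrative
// names), createNetworkingConfig carries only A into the create call, while
// connectNetworkingConfig returns B and C for later "docker network connect"
// operations.
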
func getEndpointConfig(na *api.NetworkAttachment) *network.EndpointSettings {
	var ipv4, ipv6 string
	for _, addr := range na.Addresses {
		ip, _, err := net.ParseCIDR(addr)
		if err != nil {
			continue
		}

		if ip.To4() != nil {
			ipv4 = ip.String()
			continue
		}

		if ip.To16() != nil {
			ipv6 = ip.String()
		}
	}

	return &network.EndpointSettings{
		IPAMConfig: &network.EndpointIPAMConfig{
			IPv4Address: ipv4,
			IPv6Address: ipv6,
		},
	}
}
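// Illustrative example (hypothetical addresses): attachment addresses
// ["10.0.0.3/24", "fd00::3/64"] yield IPv4Address "10.0.0.3" and
// IPv6Address "fd00::3"; entries that fail to parse are skipped.
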
func (c *containerConfig) virtualIP(networkID string) string {
	if c.task.Endpoint == nil {
		return ""
	}

	for _, eVip := range c.task.Endpoint.VirtualIPs {
		// We only support IPv4 VIPs for now.
		if eVip.NetworkID == networkID {
			vip, _, err := net.ParseCIDR(eVip.Addr)
			if err != nil {
				return ""
			}

			return vip.String()
		}
	}

	return ""
}
func (c *containerConfig) serviceConfig() *clustertypes.ServiceConfig {
	if len(c.task.Networks) == 0 {
		return nil
	}

	logrus.Debugf("Creating service config in agent for t = %+v", c.task)
	svcCfg := &clustertypes.ServiceConfig{
		Name:             c.task.ServiceAnnotations.Name,
		Aliases:          make(map[string][]string),
		ID:               c.task.ServiceID,
		VirtualAddresses: make(map[string]*clustertypes.VirtualAddress),
	}

	for _, na := range c.task.Networks {
		svcCfg.VirtualAddresses[na.Network.ID] = &clustertypes.VirtualAddress{
			// We support only IPv4 virtual IPs for now.
			IPv4: c.virtualIP(na.Network.ID),
		}

		if len(na.Aliases) > 0 {
			svcCfg.Aliases[na.Network.ID] = na.Aliases
		}
	}

	if c.task.Endpoint != nil {
		for _, ePort := range c.task.Endpoint.Ports {
			svcCfg.ExposedPorts = append(svcCfg.ExposedPorts, &clustertypes.PortConfig{
				Name:          ePort.Name,
				Protocol:      int32(ePort.Protocol),
				TargetPort:    ePort.TargetPort,
				PublishedPort: ePort.PublishedPort,
			})
		}
	}

	return svcCfg
}
// networks returns a list of network names attached to the container. The
// returned names can be used to look up the corresponding network create
// options.
func (c *containerConfig) networks() []string {
	var networks []string
	for name := range c.networksAttachments {
		networks = append(networks, name)
	}

	return networks
}
func (c *containerConfig) networkCreateRequest(name string) (clustertypes.NetworkCreateRequest, error) {
	na, ok := c.networksAttachments[name]
	if !ok {
		return clustertypes.NetworkCreateRequest{}, errors.New("container: unknown network referenced")
	}

	options := types.NetworkCreate{
		// ID:     na.Network.ID,
		Driver: na.Network.DriverState.Name,
		IPAM: network.IPAM{
			Driver: na.Network.IPAM.Driver.Name,
		},
		Options:        na.Network.DriverState.Options,
		Labels:         na.Network.Spec.Annotations.Labels,
		Internal:       na.Network.Spec.Internal,
		EnableIPv6:     na.Network.Spec.Ipv6Enabled,
		CheckDuplicate: true,
	}

	for _, ic := range na.Network.IPAM.Configs {
		c := network.IPAMConfig{
			Subnet:  ic.Subnet,
			IPRange: ic.Range,
			Gateway: ic.Gateway,
		}
		options.IPAM.Config = append(options.IPAM.Config, c)
	}

	return clustertypes.NetworkCreateRequest{na.Network.ID, types.NetworkCreateRequest{Name: name, NetworkCreate: options}}, nil
}
func (c containerConfig) eventFilter() filters.Args {
	filter := filters.NewArgs()
	filter.Add("type", events.ContainerEventType)
	filter.Add("name", c.name())
	filter.Add("label", fmt.Sprintf("%v.task.id=%v", systemLabelPrefix, c.task.ID))
	return filter
}
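// The label filter above matches the system label written by labels(), e.g.
// "com.docker.swarm.task.id=<task id>", so only events for this task's
// container are returned.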