container.go

package container

import (
	"errors"
	"fmt"
	"net"
	"strconv"
	"strings"
	"time"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/api/types"
	enginecontainer "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/events"
	"github.com/docker/docker/api/types/filters"
	"github.com/docker/docker/api/types/network"
	volumetypes "github.com/docker/docker/api/types/volume"
	clustertypes "github.com/docker/docker/daemon/cluster/provider"
	"github.com/docker/docker/reference"
	"github.com/docker/go-connections/nat"
	"github.com/docker/swarmkit/agent/exec"
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/protobuf/ptypes"
	"github.com/docker/swarmkit/template"
)

const (
	// Explicitly use the kernel's default setting for CPU quota of 100ms.
	// https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt
	cpuQuotaPeriod = 100 * time.Millisecond

	// systemLabelPrefix represents the reserved namespace for system labels.
	systemLabelPrefix = "com.docker.swarm"
)
// containerConfig converts task properties into docker container compatible
// components.
type containerConfig struct {
	task                *api.Task
	networksAttachments map[string]*api.NetworkAttachment
}

// newContainerConfig returns a validated container config. No methods should
// return an error if this function returns without error.
func newContainerConfig(t *api.Task) (*containerConfig, error) {
	var c containerConfig
	return &c, c.setTask(t)
}
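// setTask validates the task, indexes its network attachments by name, and
// caches a template-expanded copy of the container spec.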
func (c *containerConfig) setTask(t *api.Task) error {
	if t.Spec.GetContainer() == nil && t.Spec.GetAttachment() == nil {
		return exec.ErrRuntimeUnsupported
	}

	container := t.Spec.GetContainer()
	if container != nil {
		if container.Image == "" {
			return ErrImageRequired
		}

		if err := validateMounts(container.Mounts); err != nil {
			return err
		}
	}

	// index the networks by name
	c.networksAttachments = make(map[string]*api.NetworkAttachment, len(t.Networks))
	for _, attachment := range t.Networks {
		c.networksAttachments[attachment.Network.Spec.Annotations.Name] = attachment
	}

	c.task = t

	if t.Spec.GetContainer() != nil {
		preparedSpec, err := template.ExpandContainerSpec(t)
		if err != nil {
			return err
		}
		c.task.Spec.Runtime = &api.TaskSpec_Container{
			Container: preparedSpec,
		}
	}

	return nil
}
func (c *containerConfig) id() string {
	attachment := c.task.Spec.GetAttachment()
	if attachment == nil {
		return ""
	}

	return attachment.ContainerID
}

func (c *containerConfig) taskID() string {
	return c.task.ID
}

func (c *containerConfig) endpoint() *api.Endpoint {
	return c.task.Endpoint
}

func (c *containerConfig) spec() *api.ContainerSpec {
	return c.task.Spec.GetContainer()
}

func (c *containerConfig) nameOrID() string {
	if c.task.Spec.GetContainer() != nil {
		return c.name()
	}

	return c.id()
}
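// name returns the container name: the orchestrator-assigned annotation if
// set, otherwise <service>.<slot-or-node>.<taskID>.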
func (c *containerConfig) name() string {
	if c.task.Annotations.Name != "" {
		// If set, use the container Annotations.Name field, set in the orchestrator.
		return c.task.Annotations.Name
	}

	slot := fmt.Sprint(c.task.Slot)
	if slot == "" || c.task.Slot == 0 {
		slot = c.task.NodeID
	}

	// Fall back to service.slot.id.
	return fmt.Sprintf("%s.%s.%s", c.task.ServiceAnnotations.Name, slot, c.task.ID)
}
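// image returns the image reference from the spec, normalized with the
// default tag when the reference parses and no tag is given.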
func (c *containerConfig) image() string {
	raw := c.spec().Image
	ref, err := reference.ParseNamed(raw)
	if err != nil {
		return raw
	}
	return reference.WithDefaultTag(ref).String()
}
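// portBindings returns the nat.PortMap for ports published in host mode on
// the task endpoint.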
func (c *containerConfig) portBindings() nat.PortMap {
	portBindings := nat.PortMap{}
	if c.task.Endpoint == nil {
		return portBindings
	}

	for _, portConfig := range c.task.Endpoint.Ports {
		if portConfig.PublishMode != api.PublishModeHost {
			continue
		}

		port := nat.Port(fmt.Sprintf("%d/%s", portConfig.TargetPort, strings.ToLower(portConfig.Protocol.String())))
		binding := []nat.PortBinding{
			{},
		}

		if portConfig.PublishedPort != 0 {
			binding[0].HostPort = strconv.Itoa(int(portConfig.PublishedPort))
		}
		portBindings[port] = binding
	}

	return portBindings
}
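// exposedPorts returns the set of container ports published in host mode
// that must be exposed on the container.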
func (c *containerConfig) exposedPorts() map[nat.Port]struct{} {
	exposedPorts := make(map[nat.Port]struct{})
	if c.task.Endpoint == nil {
		return exposedPorts
	}

	for _, portConfig := range c.task.Endpoint.Ports {
		if portConfig.PublishMode != api.PublishModeHost {
			continue
		}

		port := nat.Port(fmt.Sprintf("%d/%s", portConfig.TargetPort, strings.ToLower(portConfig.Protocol.String())))
		exposedPorts[port] = struct{}{}
	}

	return exposedPorts
}
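// config builds the engine-side container.Config from the task's container
// spec.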
func (c *containerConfig) config() *enginecontainer.Config {
	config := &enginecontainer.Config{
		Labels:       c.labels(),
		Tty:          c.spec().TTY,
		OpenStdin:    c.spec().OpenStdin,
		User:         c.spec().User,
		Env:          c.spec().Env,
		Hostname:     c.spec().Hostname,
		WorkingDir:   c.spec().Dir,
		Image:        c.image(),
		Volumes:      c.volumes(),
		ExposedPorts: c.exposedPorts(),
		Healthcheck:  c.healthcheck(),
	}

	if len(c.spec().Command) > 0 {
		// If Command is provided, it replaces the image's Entrypoint, and
		// Args becomes the Cmd.
		config.Entrypoint = append(config.Entrypoint, c.spec().Command...)
		config.Cmd = append(config.Cmd, c.spec().Args...)
	} else if len(c.spec().Args) > 0 {
		// In this case, we assume the image has an Entrypoint and Args
		// specifies the arguments for that entrypoint.
		config.Cmd = c.spec().Args
	}

	return config
}
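// labels merges spec labels with task annotation overrides and then applies
// the reserved system labels, which take precedence.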
func (c *containerConfig) labels() map[string]string {
	var (
		system = map[string]string{
			"task":         "", // mark as cluster task
			"task.id":      c.task.ID,
			"task.name":    c.name(),
			"node.id":      c.task.NodeID,
			"service.id":   c.task.ServiceID,
			"service.name": c.task.ServiceAnnotations.Name,
		}
		labels = make(map[string]string)
	)

	// base labels are those defined in the spec.
	for k, v := range c.spec().Labels {
		labels[k] = v
	}

	// we then apply the overrides from the task, which may be set via the
	// orchestrator.
	for k, v := range c.task.Annotations.Labels {
		labels[k] = v
	}

	// finally, we apply the system labels, which override all labels.
	for k, v := range system {
		labels[strings.Join([]string{systemLabelPrefix, k}, ".")] = v
	}

	return labels
}
// volumes gets placed into the Volumes field on the containerConfig.
func (c *containerConfig) volumes() map[string]struct{} {
	r := make(map[string]struct{})
	// Volumes *only* creates anonymous volumes. Everything else is passed
	// through Binds, even though those entries are not all true bind mounts.
	// Basically, any volume that results in a single component must be added
	// here.
	//
	// This is reverse engineered from the behavior of the engine API.
	for _, mount := range c.spec().Mounts {
		if mount.Type == api.MountTypeVolume && mount.Source == "" {
			r[mount.Target] = struct{}{}
		}
	}

	return r
}
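// tmpfs returns the tmpfs mounts keyed by target path, with their rendered
// mount options.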
func (c *containerConfig) tmpfs() map[string]string {
	r := make(map[string]string)

	for _, spec := range c.spec().Mounts {
		if spec.Type != api.MountTypeTmpfs {
			continue
		}

		r[spec.Target] = getMountMask(&spec)
	}

	return r
}
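// binds renders bind mounts and named volumes in the engine's
// source:target[:options] format.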
func (c *containerConfig) binds() []string {
	var r []string
	for _, mount := range c.spec().Mounts {
		if mount.Type == api.MountTypeBind || (mount.Type == api.MountTypeVolume && mount.Source != "") {
			spec := fmt.Sprintf("%s:%s", mount.Source, mount.Target)
			mask := getMountMask(&mount)
			if mask != "" {
				spec = fmt.Sprintf("%s:%s", spec, mask)
			}
			r = append(r, spec)
		}
	}
	return r
}
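// healthcheck converts the spec's health check, if present, into the
// engine's HealthConfig.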
func (c *containerConfig) healthcheck() *enginecontainer.HealthConfig {
	hcSpec := c.spec().Healthcheck
	if hcSpec == nil {
		return nil
	}
	interval, _ := ptypes.Duration(hcSpec.Interval)
	timeout, _ := ptypes.Duration(hcSpec.Timeout)
	return &enginecontainer.HealthConfig{
		Test:     hcSpec.Test,
		Interval: interval,
		Timeout:  timeout,
		Retries:  int(hcSpec.Retries),
	}
}
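// getMountMask renders a mount's options (read-only, volume nocopy, bind
// propagation, tmpfs mode and size) as a comma-separated option string.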
func getMountMask(m *api.Mount) string {
	var maskOpts []string
	if m.ReadOnly {
		maskOpts = append(maskOpts, "ro")
	}

	switch m.Type {
	case api.MountTypeVolume:
		if m.VolumeOptions != nil && m.VolumeOptions.NoCopy {
			maskOpts = append(maskOpts, "nocopy")
		}
	case api.MountTypeBind:
		if m.BindOptions == nil {
			break
		}

		switch m.BindOptions.Propagation {
		case api.MountPropagationPrivate:
			maskOpts = append(maskOpts, "private")
		case api.MountPropagationRPrivate:
			maskOpts = append(maskOpts, "rprivate")
		case api.MountPropagationShared:
			maskOpts = append(maskOpts, "shared")
		case api.MountPropagationRShared:
			maskOpts = append(maskOpts, "rshared")
		case api.MountPropagationSlave:
			maskOpts = append(maskOpts, "slave")
		case api.MountPropagationRSlave:
			maskOpts = append(maskOpts, "rslave")
		}
	case api.MountTypeTmpfs:
		if m.TmpfsOptions == nil {
			break
		}

		if m.TmpfsOptions.Mode != 0 {
			maskOpts = append(maskOpts, fmt.Sprintf("mode=%o", m.TmpfsOptions.Mode))
		}

		if m.TmpfsOptions.SizeBytes != 0 {
			// Calculate the suffix here, making this Linux specific, but that
			// is okay, since the API is that way anyway. We do this by finding
			// the suffix that divides evenly into the value, returning the
			// value itself, with no suffix, if it fails.
			//
			// For the most part, we don't enforce any semantics on these
			// values. The operating system will usually align them and
			// enforce minimums and maximums.
			var (
				size   = m.TmpfsOptions.SizeBytes
				suffix string
			)
			for _, r := range []struct {
				suffix  string
				divisor int64
			}{
				{"g", 1 << 30},
				{"m", 1 << 20},
				{"k", 1 << 10},
			} {
				if size%r.divisor == 0 {
					size = size / r.divisor
					suffix = r.suffix
					break
				}
			}

			maskOpts = append(maskOpts, fmt.Sprintf("size=%d%s", size, suffix))
		}
	}

	return strings.Join(maskOpts, ",")
}
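// hostConfig builds the engine HostConfig: resource limits, binds, tmpfs
// mounts, groups, port bindings, DNS settings, extra hosts, and log driver.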
func (c *containerConfig) hostConfig() *enginecontainer.HostConfig {
	hc := &enginecontainer.HostConfig{
		Resources:    c.resources(),
		Binds:        c.binds(),
		Tmpfs:        c.tmpfs(),
		GroupAdd:     c.spec().Groups,
		PortBindings: c.portBindings(),
	}

	if c.spec().DNSConfig != nil {
		hc.DNS = c.spec().DNSConfig.Nameservers
		hc.DNSSearch = c.spec().DNSConfig.Search
		hc.DNSOptions = c.spec().DNSConfig.Options
	}

	// The format of extra hosts in swarmkit is specified in:
	// http://man7.org/linux/man-pages/man5/hosts.5.html
	//    IP_address canonical_hostname [aliases...]
	// However, the format of ExtraHosts in HostConfig is
	//    <host>:<ip>
	// so we need to do the conversion here. (Aliases are ignored for now.)
	for _, entry := range c.spec().Hosts {
		parts := strings.Fields(entry)
		if len(parts) > 1 {
			hc.ExtraHosts = append(hc.ExtraHosts, fmt.Sprintf("%s:%s", parts[1], parts[0]))
		}
	}

	if c.task.LogDriver != nil {
		hc.LogConfig = enginecontainer.LogConfig{
			Type:   c.task.LogDriver.Name,
			Config: c.task.LogDriver.Options,
		}
	}

	return hc
}
// volumeCreateRequest handles the case of volumes that are defined inside a
// service Mount, returning the engine request used to create them.
func (c *containerConfig) volumeCreateRequest(mount *api.Mount) *volumetypes.VolumesCreateBody {
	var (
		driverName string
		driverOpts map[string]string
		labels     map[string]string
	)

	if mount.VolumeOptions != nil && mount.VolumeOptions.DriverConfig != nil {
		driverName = mount.VolumeOptions.DriverConfig.Name
		driverOpts = mount.VolumeOptions.DriverConfig.Options
		labels = mount.VolumeOptions.Labels
	}

	if mount.VolumeOptions != nil {
		return &volumetypes.VolumesCreateBody{
			Name:       mount.Source,
			Driver:     driverName,
			DriverOpts: driverOpts,
			Labels:     labels,
		}
	}
	return nil
}
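// resources translates the task's resource limits into engine cgroup
// settings (memory, CPU period and quota).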
func (c *containerConfig) resources() enginecontainer.Resources {
	resources := enginecontainer.Resources{}

	// If no limits are specified let the engine use its defaults.
	//
	// TODO(aluzzardi): We might want to set some limits anyway otherwise
	// "unlimited" tasks will step over the reservation of other tasks.
	r := c.task.Spec.Resources
	if r == nil || r.Limits == nil {
		return resources
	}

	if r.Limits.MemoryBytes > 0 {
		resources.Memory = r.Limits.MemoryBytes
	}

	if r.Limits.NanoCPUs > 0 {
		// CPU Period must be set in microseconds.
		resources.CPUPeriod = int64(cpuQuotaPeriod / time.Microsecond)
		resources.CPUQuota = r.Limits.NanoCPUs * resources.CPUPeriod / 1e9
	}

	return resources
}
// The Docker daemon supports only one network during container create, so
// createNetworkingConfig includes just the first network attachment.
func (c *containerConfig) createNetworkingConfig() *network.NetworkingConfig {
	var networks []*api.NetworkAttachment
	if c.task.Spec.GetContainer() != nil || c.task.Spec.GetAttachment() != nil {
		networks = c.task.Networks
	}

	epConfig := make(map[string]*network.EndpointSettings)
	if len(networks) > 0 {
		epConfig[networks[0].Network.Spec.Annotations.Name] = getEndpointConfig(networks[0])
	}

	return &network.NetworkingConfig{EndpointsConfig: epConfig}
}
// TODO: Merge this function with createNetworkingConfig after the daemon
// supports multiple networks in container create.
func (c *containerConfig) connectNetworkingConfig() *network.NetworkingConfig {
	var networks []*api.NetworkAttachment
	if c.task.Spec.GetContainer() != nil {
		networks = c.task.Networks
	}

	// The first network is used during container create. The remaining
	// networks are attached with "docker network connect".
	if len(networks) < 2 {
		return nil
	}

	epConfig := make(map[string]*network.EndpointSettings)
	for _, na := range networks[1:] {
		epConfig[na.Network.Spec.Annotations.Name] = getEndpointConfig(na)
	}
	return &network.NetworkingConfig{EndpointsConfig: epConfig}
}
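// getEndpointConfig extracts the attachment's IPv4 and IPv6 addresses into
// endpoint IPAM settings for the attached network.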
func getEndpointConfig(na *api.NetworkAttachment) *network.EndpointSettings {
	var ipv4, ipv6 string
	for _, addr := range na.Addresses {
		ip, _, err := net.ParseCIDR(addr)
		if err != nil {
			continue
		}

		if ip.To4() != nil {
			ipv4 = ip.String()
			continue
		}

		if ip.To16() != nil {
			ipv6 = ip.String()
		}
	}

	return &network.EndpointSettings{
		NetworkID: na.Network.ID,
		IPAMConfig: &network.EndpointIPAMConfig{
			IPv4Address: ipv4,
			IPv6Address: ipv6,
		},
	}
}
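// virtualIP returns the task endpoint's virtual IP on the given network, or
// an empty string if none is assigned.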
func (c *containerConfig) virtualIP(networkID string) string {
	if c.task.Endpoint == nil {
		return ""
	}

	for _, eVip := range c.task.Endpoint.VirtualIPs {
		// We only support IPv4 VIPs for now.
		if eVip.NetworkID == networkID {
			vip, _, err := net.ParseCIDR(eVip.Addr)
			if err != nil {
				return ""
			}

			return vip.String()
		}
	}

	return ""
}
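// serviceConfig assembles the cluster service configuration (name, aliases,
// virtual addresses, and ingress-published ports) for the task's networks.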
func (c *containerConfig) serviceConfig() *clustertypes.ServiceConfig {
	if len(c.task.Networks) == 0 {
		return nil
	}

	logrus.Debugf("Creating service config in agent for t = %+v", c.task)
	svcCfg := &clustertypes.ServiceConfig{
		Name:             c.task.ServiceAnnotations.Name,
		Aliases:          make(map[string][]string),
		ID:               c.task.ServiceID,
		VirtualAddresses: make(map[string]*clustertypes.VirtualAddress),
	}

	for _, na := range c.task.Networks {
		svcCfg.VirtualAddresses[na.Network.ID] = &clustertypes.VirtualAddress{
			// We support only IPv4 virtual IPs for now.
			IPv4: c.virtualIP(na.Network.ID),
		}
		if len(na.Aliases) > 0 {
			svcCfg.Aliases[na.Network.ID] = na.Aliases
		}
	}

	if c.task.Endpoint != nil {
		for _, ePort := range c.task.Endpoint.Ports {
			if ePort.PublishMode != api.PublishModeIngress {
				continue
			}

			svcCfg.ExposedPorts = append(svcCfg.ExposedPorts, &clustertypes.PortConfig{
				Name:          ePort.Name,
				Protocol:      int32(ePort.Protocol),
				TargetPort:    ePort.TargetPort,
				PublishedPort: ePort.PublishedPort,
			})
		}
	}

	return svcCfg
}
// networks returns a list of network names attached to the container. The
// returned names can be used to look up the corresponding network create
// options.
func (c *containerConfig) networks() []string {
	var networks []string

	for name := range c.networksAttachments {
		networks = append(networks, name)
	}

	return networks
}
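// networkCreateRequest builds the engine request for creating the named
// network from its attachment's spec, driver state, and IPAM configuration.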
func (c *containerConfig) networkCreateRequest(name string) (clustertypes.NetworkCreateRequest, error) {
	na, ok := c.networksAttachments[name]
	if !ok {
		return clustertypes.NetworkCreateRequest{}, errors.New("container: unknown network referenced")
	}

	options := types.NetworkCreate{
		// ID:     na.Network.ID,
		Driver: na.Network.DriverState.Name,
		IPAM: &network.IPAM{
			Driver: na.Network.IPAM.Driver.Name,
		},
		Options:        na.Network.DriverState.Options,
		Labels:         na.Network.Spec.Annotations.Labels,
		Internal:       na.Network.Spec.Internal,
		EnableIPv6:     na.Network.Spec.Ipv6Enabled,
		CheckDuplicate: true,
	}

	for _, ic := range na.Network.IPAM.Configs {
		c := network.IPAMConfig{
			Subnet:  ic.Subnet,
			IPRange: ic.Range,
			Gateway: ic.Gateway,
		}
		options.IPAM.Config = append(options.IPAM.Config, c)
	}

	return clustertypes.NetworkCreateRequest{na.Network.ID, types.NetworkCreateRequest{Name: name, NetworkCreate: options}}, nil
}
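// eventFilter returns the filter used to watch engine container events for
// this task's container.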
func (c containerConfig) eventFilter() filters.Args {
	filter := filters.NewArgs()
	filter.Add("type", events.ContainerEventType)
	filter.Add("name", c.name())
	filter.Add("label", fmt.Sprintf("%v.task.id=%v", systemLabelPrefix, c.task.ID))
	return filter
}