From f178c2ab15e555d058e06aa85d03375ccb577940 Mon Sep 17 00:00:00 2001 From: Cory Snider Date: Mon, 5 Jun 2023 18:30:30 -0400 Subject: [PATCH 1/5] daemon: modernize oci_linux_test.go Switch to using t.TempDir() instead of rolling our own. Clean up mounts leaked by the tests as otherwise the tests fail due to the leaked mounts because unlike the old cleanup code, t.TempDir() cleanup does not ignore errors from os.RemoveAll. Signed-off-by: Cory Snider (cherry picked from commit 9ff169ccf421c00f3106481e43c5d86f77403b06) Signed-off-by: Sebastiaan van Stijn --- daemon/oci_linux_test.go | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/daemon/oci_linux_test.go b/daemon/oci_linux_test.go index 42084c900d..37e0b7526c 100644 --- a/daemon/oci_linux_test.go +++ b/daemon/oci_linux_test.go @@ -11,17 +11,18 @@ import ( "github.com/docker/docker/daemon/network" "github.com/docker/docker/libnetwork" "github.com/docker/docker/pkg/containerfs" + "golang.org/x/sys/unix" "gotest.tools/v3/assert" is "gotest.tools/v3/assert/cmp" "gotest.tools/v3/skip" ) func setupFakeDaemon(t *testing.T, c *container.Container) *Daemon { - root, err := os.MkdirTemp("", "oci_linux_test-root") - assert.NilError(t, err) + t.Helper() + root := t.TempDir() rootfs := filepath.Join(root, "rootfs") - err = os.MkdirAll(rootfs, 0755) + err := os.MkdirAll(rootfs, 0755) assert.NilError(t, err) netController, err := libnetwork.New() @@ -48,11 +49,19 @@ func setupFakeDaemon(t *testing.T, c *container.Container) *Daemon { c.NetworkSettings = &network.Settings{Networks: make(map[string]*network.EndpointSettings)} } - return d -} + // HORRIBLE HACK: clean up shm mounts leaked by some tests. Otherwise the + // offending tests would fail due to the mounts blocking the temporary + // directory from being cleaned up. + t.Cleanup(func() { + if c.ShmPath != "" { + var err error + for err == nil { // Some tests over-mount over the same path multiple times. + err = unix.Unmount(c.ShmPath, unix.MNT_DETACH) + } + } + }) -func cleanupFakeContainer(c *container.Container) { - _ = os.RemoveAll(c.Root) + return d } // TestTmpfsDevShmNoDupMount checks that a user-specified /dev/shm tmpfs @@ -72,7 +81,6 @@ func TestTmpfsDevShmNoDupMount(t *testing.T) { }, } d := setupFakeDaemon(t, c) - defer cleanupFakeContainer(c) _, err := d.createSpec(c) assert.Check(t, err) @@ -91,7 +99,6 @@ func TestIpcPrivateVsReadonly(t *testing.T) { }, } d := setupFakeDaemon(t, c) - defer cleanupFakeContainer(c) s, err := d.createSpec(c) assert.Check(t, err) @@ -120,7 +127,6 @@ func TestSysctlOverride(t *testing.T) { }, } d := setupFakeDaemon(t, c) - defer cleanupFakeContainer(c) // Ensure that the implicit sysctl is set correctly. s, err := d.createSpec(c) @@ -172,7 +178,6 @@ func TestSysctlOverrideHost(t *testing.T) { }, } d := setupFakeDaemon(t, c) - defer cleanupFakeContainer(c) // Ensure that the implicit sysctl is not set s, err := d.createSpec(c) From 4949db7f6266ab3d6a00da10340811d5c5f014dd Mon Sep 17 00:00:00 2001 From: Cory Snider Date: Mon, 5 Jun 2023 18:44:51 -0400 Subject: [PATCH 2/5] daemon: stop setting container resources to zero Many of the fields in LinuxResources struct are pointers to scalars for some reason, presumably to differentiate between set-to-zero and unset when unmarshaling from JSON, despite zero being outside the acceptable range for the corresponding kernel tunables. When creating the OCI spec for a container, the daemon sets the container's OCI spec CPUShares and BlkioWeight parameters to zero when the corresponding Docker container configuration values are zero, signifying unset, despite the minimum acceptable value for CPUShares being two, and BlkioWeight ten. This has gone unnoticed as runC does not distingiush set-to-zero from unset as it also uses zero internally to represent unset for those fields. However, kata-containers v3.2.0-alpha.3 tries to apply the explicit-zero resource parameters to the container, exactly as instructed, and fails loudly. The OCI runtime-spec is silent on how the runtime should handle the case when those parameters are explicitly set to out-of-range values and kata's behaviour is not unreasonable, so the daemon must therefore be in the wrong. Translate unset values in the Docker container's resources HostConfig to omit the corresponding fields in the container's OCI spec when starting and updating a container in order to maximize compatibility with runtimes. Signed-off-by: Cory Snider (cherry picked from commit dea870f4eafac1efb1d3a70d8196da11bfa6c59c) Signed-off-by: Sebastiaan van Stijn --- daemon/daemon_unix.go | 12 +++++-- daemon/oci_linux.go | 6 ++-- daemon/oci_linux_test.go | 37 ++++++++++++++++++++ daemon/update_linux.go | 50 +++++++++++++++++++--------- daemon/update_linux_test.go | 11 ++++++ libcontainerd/remote/client_linux.go | 4 +-- libcontainerd/types/types_linux.go | 2 +- 7 files changed, 98 insertions(+), 24 deletions(-) create mode 100644 daemon/update_linux_test.go diff --git a/daemon/daemon_unix.go b/daemon/daemon_unix.go index a252b15f71..c589c5ea88 100644 --- a/daemon/daemon_unix.go +++ b/daemon/daemon_unix.go @@ -109,7 +109,10 @@ func getMemoryResources(config containertypes.Resources) *specs.LinuxMemory { memory.KernelTCP = &config.KernelMemoryTCP } - return &memory + if memory != (specs.LinuxMemory{}) { + return &memory + } + return nil } func getPidsLimit(config containertypes.Resources) *specs.LinuxPids { @@ -131,7 +134,7 @@ func getCPUResources(config containertypes.Resources) (*specs.LinuxCPU, error) { if config.CPUShares < 0 { return nil, fmt.Errorf("shares: invalid argument") } - if config.CPUShares >= 0 { + if config.CPUShares > 0 { shares := uint64(config.CPUShares) cpu.Shares = &shares } @@ -172,7 +175,10 @@ func getCPUResources(config containertypes.Resources) (*specs.LinuxCPU, error) { cpu.RealtimeRuntime = &c } - return &cpu, nil + if cpu != (specs.LinuxCPU{}) { + return &cpu, nil + } + return nil, nil } func getBlkioWeightDevices(config containertypes.Resources) ([]specs.LinuxWeightDevice, error) { diff --git a/daemon/oci_linux.go b/daemon/oci_linux.go index ae2a4512b1..95ab269079 100644 --- a/daemon/oci_linux.go +++ b/daemon/oci_linux.go @@ -959,13 +959,11 @@ func WithResources(c *container.Container) coci.SpecOpts { if err != nil { return err } - blkioWeight := r.BlkioWeight specResources := &specs.LinuxResources{ Memory: memoryRes, CPU: cpuRes, BlockIO: &specs.LinuxBlockIO{ - Weight: &blkioWeight, WeightDevice: weightDevices, ThrottleReadBpsDevice: readBpsDevice, ThrottleWriteBpsDevice: writeBpsDevice, @@ -974,6 +972,10 @@ func WithResources(c *container.Container) coci.SpecOpts { }, Pids: getPidsLimit(r), } + if r.BlkioWeight != 0 { + w := r.BlkioWeight + specResources.BlockIO.Weight = &w + } if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 { specResources.Devices = s.Linux.Resources.Devices diff --git a/daemon/oci_linux_test.go b/daemon/oci_linux_test.go index 37e0b7526c..cfc898c389 100644 --- a/daemon/oci_linux_test.go +++ b/daemon/oci_linux_test.go @@ -11,6 +11,8 @@ import ( "github.com/docker/docker/daemon/network" "github.com/docker/docker/libnetwork" "github.com/docker/docker/pkg/containerfs" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" "gotest.tools/v3/assert" is "gotest.tools/v3/assert/cmp" @@ -205,3 +207,38 @@ func TestGetSourceMount(t *testing.T) { _, _, err = getSourceMount(cwd) assert.NilError(t, err) } + +func TestDefaultResources(t *testing.T) { + skip.If(t, os.Getuid() != 0, "skipping test that requires root") // TODO: is this actually true? I'm guilty of following the cargo cult here. + + c := &container.Container{ + HostConfig: &containertypes.HostConfig{ + IpcMode: containertypes.IPCModeNone, + }, + } + d := setupFakeDaemon(t, c) + + s, err := d.createSpec(c) + assert.NilError(t, err) + checkResourcesAreUnset(t, s.Linux.Resources) +} + +func checkResourcesAreUnset(t *testing.T, r *specs.LinuxResources) { + t.Helper() + + if r != nil { + if r.Memory != nil { + assert.Check(t, is.DeepEqual(r.Memory, &specs.LinuxMemory{})) + } + if r.CPU != nil { + assert.Check(t, is.DeepEqual(r.CPU, &specs.LinuxCPU{})) + } + assert.Check(t, is.Nil(r.Pids)) + if r.BlockIO != nil { + assert.Check(t, is.DeepEqual(r.BlockIO, &specs.LinuxBlockIO{}, cmpopts.EquateEmpty())) + } + if r.Network != nil { + assert.Check(t, is.DeepEqual(r.Network, &specs.LinuxNetwork{}, cmpopts.EquateEmpty())) + } + } +} diff --git a/daemon/update_linux.go b/daemon/update_linux.go index c1d3684868..3105402e3c 100644 --- a/daemon/update_linux.go +++ b/daemon/update_linux.go @@ -11,15 +11,19 @@ import ( func toContainerdResources(resources container.Resources) *libcontainerdtypes.Resources { var r libcontainerdtypes.Resources - r.BlockIO = &specs.LinuxBlockIO{ - Weight: &resources.BlkioWeight, + if resources.BlkioWeight != 0 { + r.BlockIO = &specs.LinuxBlockIO{ + Weight: &resources.BlkioWeight, + } } - shares := uint64(resources.CPUShares) - r.CPU = &specs.LinuxCPU{ - Shares: &shares, - Cpus: resources.CpusetCpus, - Mems: resources.CpusetMems, + cpu := specs.LinuxCPU{ + Cpus: resources.CpusetCpus, + Mems: resources.CpusetMems, + } + if resources.CPUShares != 0 { + shares := uint64(resources.CPUShares) + cpu.Shares = &shares } var ( @@ -37,17 +41,33 @@ func toContainerdResources(resources container.Resources) *libcontainerdtypes.Re period = uint64(resources.CPUPeriod) } - r.CPU.Period = &period - r.CPU.Quota = "a - - r.Memory = &specs.LinuxMemory{ - Limit: &resources.Memory, - Reservation: &resources.MemoryReservation, - Kernel: &resources.KernelMemory, + if period != 0 { + cpu.Period = &period + } + if quota != 0 { + cpu.Quota = "a } + if cpu != (specs.LinuxCPU{}) { + r.CPU = &cpu + } + + var memory specs.LinuxMemory + if resources.Memory != 0 { + memory.Limit = &resources.Memory + } + if resources.MemoryReservation != 0 { + memory.Reservation = &resources.MemoryReservation + } + if resources.KernelMemory != 0 { + memory.Kernel = &resources.KernelMemory + } if resources.MemorySwap > 0 { - r.Memory.Swap = &resources.MemorySwap + memory.Swap = &resources.MemorySwap + } + + if memory != (specs.LinuxMemory{}) { + r.Memory = &memory } r.Pids = getPidsLimit(resources) diff --git a/daemon/update_linux_test.go b/daemon/update_linux_test.go new file mode 100644 index 0000000000..4817c1eab7 --- /dev/null +++ b/daemon/update_linux_test.go @@ -0,0 +1,11 @@ +package daemon // import "github.com/docker/docker/daemon" + +import ( + "testing" + + "github.com/docker/docker/api/types/container" +) + +func TestToContainerdResources_Defaults(t *testing.T) { + checkResourcesAreUnset(t, toContainerdResources(container.Resources{})) +} diff --git a/libcontainerd/remote/client_linux.go b/libcontainerd/remote/client_linux.go index e45d140b2f..85a96579f7 100644 --- a/libcontainerd/remote/client_linux.go +++ b/libcontainerd/remote/client_linux.go @@ -26,9 +26,7 @@ func (c *client) UpdateResources(ctx context.Context, containerID string, resour return err } - // go doesn't like the alias in 1.8, this means this need to be - // platform specific - return p.(containerd.Task).Update(ctx, containerd.WithResources((*specs.LinuxResources)(resources))) + return p.(containerd.Task).Update(ctx, containerd.WithResources(resources)) } func hostIDFromMap(id uint32, mp []specs.LinuxIDMapping) int { diff --git a/libcontainerd/types/types_linux.go b/libcontainerd/types/types_linux.go index 6b05385221..3433bda055 100644 --- a/libcontainerd/types/types_linux.go +++ b/libcontainerd/types/types_linux.go @@ -27,7 +27,7 @@ func InterfaceToStats(read time.Time, v interface{}) *Stats { } // Resources defines updatable container resource values. TODO: it must match containerd upcoming API -type Resources specs.LinuxResources +type Resources = specs.LinuxResources // Checkpoints contains the details of a checkpoint type Checkpoints struct{} From 5cc173641855d1e847f48d56a497d6dce4646afa Mon Sep 17 00:00:00 2001 From: Cory Snider Date: Tue, 6 Jun 2023 12:57:38 -0400 Subject: [PATCH 3/5] daemon: ensure OCI options play nicely together Audit the OCI spec options used for Linux containers to ensure they are less order-dependent. Ensure they don't assume that any pointer fields are non-nil and that they don't unintentionally clobber mutations to the spec applied by other options. Signed-off-by: Cory Snider (cherry picked from commit 8a094fe60913fed77deb8275207368617f16328f) Signed-off-by: Sebastiaan van Stijn --- daemon/oci_linux.go | 90 +++++++++++++++++++------ daemon/oci_opts.go | 3 + daemon/oci_utils.go | 7 +- daemon/seccomp_linux.go | 4 ++ oci/namespaces.go | 3 + oci/oci.go | 3 + pkg/rootless/specconv/specconv_linux.go | 31 +++++---- 7 files changed, 108 insertions(+), 33 deletions(-) diff --git a/daemon/oci_linux.go b/daemon/oci_linux.go index 95ab269079..827ade009f 100644 --- a/daemon/oci_linux.go +++ b/daemon/oci_linux.go @@ -54,6 +54,9 @@ func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts { }) } + if s.Process == nil { + s.Process = &specs.Process{} + } s.Process.Rlimits = rlimits return nil } @@ -114,6 +117,9 @@ func WithRootless(daemon *Daemon) coci.SpecOpts { // WithOOMScore sets the oom score func WithOOMScore(score *int) coci.SpecOpts { return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + if s.Process == nil { + s.Process = &specs.Process{} + } s.Process.OOMScoreAdj = score return nil } @@ -122,6 +128,12 @@ func WithOOMScore(score *int) coci.SpecOpts { // WithSelinux sets the selinux labels func WithSelinux(c *container.Container) coci.SpecOpts { return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + if s.Process == nil { + s.Process = &specs.Process{} + } + if s.Linux == nil { + s.Linux = &specs.Linux{} + } s.Process.SelinuxLabel = c.GetProcessLabel() s.Linux.MountLabel = c.MountLabel return nil @@ -152,6 +164,9 @@ func WithApparmor(c *container.Container) coci.SpecOpts { return err } } + if s.Process == nil { + s.Process = &specs.Process{} + } s.Process.ApparmorProfile = appArmorProfile } return nil @@ -214,6 +229,10 @@ func getUser(c *container.Container, username string) (specs.User, error) { } func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) { + if s.Linux == nil { + s.Linux = &specs.Linux{} + } + for i, n := range s.Linux.Namespaces { if n.Type == ns.Type { s.Linux.Namespaces[i] = ns @@ -608,6 +627,9 @@ func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts { } rootpg := mountPropagationMap[s.Linux.RootfsPropagation] if rootpg != mount.SHARED && rootpg != mount.RSHARED { + if s.Linux == nil { + s.Linux = &specs.Linux{} + } s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED] } case mount.SLAVE, mount.RSLAVE: @@ -636,6 +658,9 @@ func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts { if !fallback { rootpg := mountPropagationMap[s.Linux.RootfsPropagation] if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE { + if s.Linux == nil { + s.Linux = &specs.Linux{} + } s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE] } } @@ -691,8 +716,10 @@ func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts { clearReadOnly(&s.Mounts[i]) } } - s.Linux.ReadonlyPaths = nil - s.Linux.MaskedPaths = nil + if s.Linux != nil { + s.Linux.ReadonlyPaths = nil + s.Linux.MaskedPaths = nil + } } // TODO: until a kernel/mount solution exists for handling remount in a user namespace, @@ -738,6 +765,9 @@ func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts { if len(cwd) == 0 { cwd = "/" } + if s.Process == nil { + s.Process = &specs.Process{} + } s.Process.Args = append([]string{c.Path}, c.Args...) // only add the custom init if it is specified and the container is running in its @@ -817,6 +847,9 @@ func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts { } else { cgroupsPath = filepath.Join(parent, c.ID) } + if s.Linux == nil { + s.Linux = &specs.Linux{} + } s.Linux.CgroupsPath = cgroupsPath // the rest is only needed for CPU RT controller @@ -917,8 +950,14 @@ func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts { } } + if s.Linux == nil { + s.Linux = &specs.Linux{} + } + if s.Linux.Resources == nil { + s.Linux.Resources = &specs.LinuxResources{} + } s.Linux.Devices = append(s.Linux.Devices, devs...) - s.Linux.Resources.Devices = devPermissions + s.Linux.Resources.Devices = append(s.Linux.Resources.Devices, devPermissions...) for _, req := range c.HostConfig.DeviceRequests { if err := daemon.handleDevice(req, s); err != nil { @@ -960,28 +999,27 @@ func WithResources(c *container.Container) coci.SpecOpts { return err } - specResources := &specs.LinuxResources{ - Memory: memoryRes, - CPU: cpuRes, - BlockIO: &specs.LinuxBlockIO{ - WeightDevice: weightDevices, - ThrottleReadBpsDevice: readBpsDevice, - ThrottleWriteBpsDevice: writeBpsDevice, - ThrottleReadIOPSDevice: readIOpsDevice, - ThrottleWriteIOPSDevice: writeIOpsDevice, - }, - Pids: getPidsLimit(r), + if s.Linux == nil { + s.Linux = &specs.Linux{} + } + if s.Linux.Resources == nil { + s.Linux.Resources = &specs.LinuxResources{} + } + s.Linux.Resources.Memory = memoryRes + s.Linux.Resources.CPU = cpuRes + s.Linux.Resources.BlockIO = &specs.LinuxBlockIO{ + WeightDevice: weightDevices, + ThrottleReadBpsDevice: readBpsDevice, + ThrottleWriteBpsDevice: writeBpsDevice, + ThrottleReadIOPSDevice: readIOpsDevice, + ThrottleWriteIOPSDevice: writeIOpsDevice, } if r.BlkioWeight != 0 { w := r.BlkioWeight - specResources.BlockIO.Weight = &w + s.Linux.Resources.BlockIO.Weight = &w } + s.Linux.Resources.Pids = getPidsLimit(r) - if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 { - specResources.Devices = s.Linux.Resources.Devices - } - - s.Linux.Resources = specResources return nil } } @@ -989,6 +1027,15 @@ func WithResources(c *container.Container) coci.SpecOpts { // WithSysctls sets the container's sysctls func WithSysctls(c *container.Container) coci.SpecOpts { return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + if len(c.HostConfig.Sysctls) == 0 { + return nil + } + if s.Linux == nil { + s.Linux = &specs.Linux{} + } + if s.Linux.Sysctl == nil { + s.Linux.Sysctl = make(map[string]string) + } // We merge the sysctls injected above with the HostConfig (latter takes // precedence for backwards-compatibility reasons). for k, v := range c.HostConfig.Sysctls { @@ -1001,6 +1048,9 @@ func WithSysctls(c *container.Container) coci.SpecOpts { // WithUser sets the container's user func WithUser(c *container.Container) coci.SpecOpts { return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + if s.Process == nil { + s.Process = &specs.Process{} + } var err error s.Process.User, err = getUser(c, c.Config.User) return err diff --git a/daemon/oci_opts.go b/daemon/oci_opts.go index c824999d50..c8b1b633b6 100644 --- a/daemon/oci_opts.go +++ b/daemon/oci_opts.go @@ -13,6 +13,9 @@ import ( func WithConsoleSize(c *container.Container) coci.SpecOpts { return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { if c.HostConfig.ConsoleSize[0] > 0 || c.HostConfig.ConsoleSize[1] > 0 { + if s.Process == nil { + s.Process = &specs.Process{} + } s.Process.ConsoleSize = &specs.Box{ Height: c.HostConfig.ConsoleSize[0], Width: c.HostConfig.ConsoleSize[1], diff --git a/daemon/oci_utils.go b/daemon/oci_utils.go index 2d833502bd..a47f7bab44 100644 --- a/daemon/oci_utils.go +++ b/daemon/oci_utils.go @@ -9,7 +9,12 @@ func setLinuxDomainname(c *container.Container, s *specs.Spec) { // There isn't a field in the OCI for the NIS domainname, but luckily there // is a sysctl which has an identical effect to setdomainname(2) so there's // no explicit need for runtime support. - s.Linux.Sysctl = make(map[string]string) + if s.Linux == nil { + s.Linux = &specs.Linux{} + } + if s.Linux.Sysctl == nil { + s.Linux.Sysctl = make(map[string]string) + } if c.Config.Domainname != "" { s.Linux.Sysctl["kernel.domainname"] = c.Config.Domainname } diff --git a/daemon/seccomp_linux.go b/daemon/seccomp_linux.go index 8336b00392..2e3c37818e 100644 --- a/daemon/seccomp_linux.go +++ b/daemon/seccomp_linux.go @@ -9,6 +9,7 @@ import ( "github.com/docker/docker/container" dconfig "github.com/docker/docker/daemon/config" "github.com/docker/docker/profiles/seccomp" + specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" ) @@ -31,6 +32,9 @@ func WithSeccomp(daemon *Daemon, c *container.Container) coci.SpecOpts { c.SeccompProfile = dconfig.SeccompProfileUnconfined return nil } + if s.Linux == nil { + s.Linux = &specs.Linux{} + } var err error switch { case c.SeccompProfile == dconfig.SeccompProfileDefault: diff --git a/oci/namespaces.go b/oci/namespaces.go index f32e489b4a..851edd61ef 100644 --- a/oci/namespaces.go +++ b/oci/namespaces.go @@ -4,6 +4,9 @@ import specs "github.com/opencontainers/runtime-spec/specs-go" // RemoveNamespace removes the `nsType` namespace from OCI spec `s` func RemoveNamespace(s *specs.Spec, nsType specs.LinuxNamespaceType) { + if s.Linux == nil { + return + } for i, n := range s.Linux.Namespaces { if n.Type == nsType { s.Linux.Namespaces = append(s.Linux.Namespaces[:i], s.Linux.Namespaces[i+1:]...) diff --git a/oci/oci.go b/oci/oci.go index 2021ec3538..864ccf5b60 100644 --- a/oci/oci.go +++ b/oci/oci.go @@ -20,6 +20,9 @@ var deviceCgroupRuleRegex = regexp.MustCompile("^([acb]) ([0-9]+|\\*):([0-9]+|\\ // SetCapabilities sets the provided capabilities on the spec // All capabilities are added if privileged is true. func SetCapabilities(s *specs.Spec, caplist []string) error { + if s.Process == nil { + s.Process = &specs.Process{} + } // setUser has already been executed here if s.Process.User.UID == 0 { s.Process.Capabilities = &specs.LinuxCapabilities{ diff --git a/pkg/rootless/specconv/specconv_linux.go b/pkg/rootless/specconv/specconv_linux.go index b706b5ca6a..06f55ef13d 100644 --- a/pkg/rootless/specconv/specconv_linux.go +++ b/pkg/rootless/specconv/specconv_linux.go @@ -40,11 +40,13 @@ func getCurrentOOMScoreAdj() int { func toRootless(spec *specs.Spec, v2Controllers []string, currentOOMScoreAdj int) error { if len(v2Controllers) == 0 { - // Remove cgroup settings. - spec.Linux.Resources = nil - spec.Linux.CgroupsPath = "" + if spec.Linux != nil { + // Remove cgroup settings. + spec.Linux.Resources = nil + spec.Linux.CgroupsPath = "" + } } else { - if spec.Linux.Resources != nil { + if spec.Linux != nil && spec.Linux.Resources != nil { m := make(map[string]struct{}) for _, s := range v2Controllers { m[s] = struct{}{} @@ -77,7 +79,7 @@ func toRootless(spec *specs.Spec, v2Controllers []string, currentOOMScoreAdj int } } - if spec.Process.OOMScoreAdj != nil && *spec.Process.OOMScoreAdj < currentOOMScoreAdj { + if spec.Process != nil && spec.Process.OOMScoreAdj != nil && *spec.Process.OOMScoreAdj < currentOOMScoreAdj { *spec.Process.OOMScoreAdj = currentOOMScoreAdj } @@ -110,6 +112,9 @@ func isHostNS(spec *specs.Spec, nsType specs.LinuxNamespaceType) (bool, error) { if strings.Contains(string(nsType), string(os.PathSeparator)) { return false, fmt.Errorf("unexpected namespace type %q", nsType) } + if spec.Linux == nil { + return false, nil + } for _, ns := range spec.Linux.Namespaces { if ns.Type == nsType { if ns.Path == "" { @@ -144,15 +149,17 @@ func bindMountHostProcfs(spec *specs.Spec) error { } } - // Remove ReadonlyPaths for /proc/* - newROP := spec.Linux.ReadonlyPaths[:0] - for _, s := range spec.Linux.ReadonlyPaths { - s = path.Clean(s) - if !strings.HasPrefix(s, "/proc/") { - newROP = append(newROP, s) + if spec.Linux != nil { + // Remove ReadonlyPaths for /proc/* + newROP := spec.Linux.ReadonlyPaths[:0] + for _, s := range spec.Linux.ReadonlyPaths { + s = path.Clean(s) + if !strings.HasPrefix(s, "/proc/") { + newROP = append(newROP, s) + } } + spec.Linux.ReadonlyPaths = newROP } - spec.Linux.ReadonlyPaths = newROP return nil } From 09b65e0082f07fe6d80ffbe247f7f6f1ff7613d8 Mon Sep 17 00:00:00 2001 From: Luboslav Pivarc Date: Wed, 10 May 2023 10:09:21 +0200 Subject: [PATCH 4/5] Do not drop effective&permitted set Currently moby drops ep sets before the entrypoint is executed. This does mean that with combination of no-new-privileges the file capabilities stops working with non-root containers. This is undesired as the usability of such containers is harmed comparing to running root containers. This commit therefore sets the effective/permitted set in order to allow use of file capabilities or libcap(3)/prctl(2) respectively with combination of no-new-privileges and without respectively. For no-new-privileges the container will be able to obtain capabilities that are requested. Signed-off-by: Luboslav Pivarc Signed-off-by: Bjorn Neergaard (cherry picked from commit 3aef732e61ec8ae0ea0bd8ad31116194e0fc21a6) Signed-off-by: Sebastiaan van Stijn --- oci/oci.go | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/oci/oci.go b/oci/oci.go index 864ccf5b60..45ed7979ee 100644 --- a/oci/oci.go +++ b/oci/oci.go @@ -23,19 +23,10 @@ func SetCapabilities(s *specs.Spec, caplist []string) error { if s.Process == nil { s.Process = &specs.Process{} } - // setUser has already been executed here - if s.Process.User.UID == 0 { - s.Process.Capabilities = &specs.LinuxCapabilities{ - Effective: caplist, - Bounding: caplist, - Permitted: caplist, - } - } else { - // Do not set Effective and Permitted capabilities for non-root users, - // to match what execve does. - s.Process.Capabilities = &specs.LinuxCapabilities{ - Bounding: caplist, - } + s.Process.Capabilities = &specs.LinuxCapabilities{ + Effective: caplist, + Bounding: caplist, + Permitted: caplist, } return nil } From 89a731096dc5e4fc1e6e8840ca43f3978a7b53c5 Mon Sep 17 00:00:00 2001 From: Luboslav Pivarc Date: Mon, 17 Jul 2023 11:48:02 +0200 Subject: [PATCH 5/5] Integration test for capabilities Verify non-root containers are able to use file capabilities. Signed-off-by: Luboslav Pivarc Co-authored-by: Cory Snider Signed-off-by: Cory Snider (cherry picked from commit 42fa7a1951f192a0038b904f529e86beeb056093) Signed-off-by: Sebastiaan van Stijn --- .../capabilities/capabilities_linux_test.go | 108 ++++++++++++++++++ integration/capabilities/main_linux_test.go | 33 ++++++ integration/internal/container/ops.go | 18 +++ 3 files changed, 159 insertions(+) create mode 100644 integration/capabilities/capabilities_linux_test.go create mode 100644 integration/capabilities/main_linux_test.go diff --git a/integration/capabilities/capabilities_linux_test.go b/integration/capabilities/capabilities_linux_test.go new file mode 100644 index 0000000000..272f3dcdb7 --- /dev/null +++ b/integration/capabilities/capabilities_linux_test.go @@ -0,0 +1,108 @@ +package capabilities + +import ( + "bytes" + "context" + "io" + "strings" + "testing" + "time" + + "github.com/docker/docker/api/types" + "github.com/docker/docker/integration/internal/container" + "github.com/docker/docker/pkg/stdcopy" + "github.com/docker/docker/testutil/fakecontext" + + "gotest.tools/v3/assert" + "gotest.tools/v3/poll" +) + +func TestNoNewPrivileges(t *testing.T) { + defer setupTest(t)() + + withFileCapability := ` + FROM debian:bullseye-slim + RUN apt-get update && apt-get install -y libcap2-bin --no-install-recommends + RUN setcap CAP_DAC_OVERRIDE=+eip /bin/cat + RUN echo "hello" > /txt && chown 0:0 /txt && chmod 700 /txt + RUN useradd -u 1500 test + ` + imageTag := "captest" + + source := fakecontext.New(t, "", fakecontext.WithDockerfile(withFileCapability)) + defer source.Close() + + client := testEnv.APIClient() + + // Build image + ctx := context.TODO() + resp, err := client.ImageBuild(ctx, + source.AsTarReader(t), + types.ImageBuildOptions{ + Tags: []string{imageTag}, + }) + assert.NilError(t, err) + _, err = io.Copy(io.Discard, resp.Body) + assert.NilError(t, err) + resp.Body.Close() + + testCases := []struct { + doc string + opts []func(*container.TestContainerConfig) + stdOut, stdErr string + }{ + { + doc: "CapabilityRequested=true", + opts: []func(*container.TestContainerConfig){ + container.WithUser("test"), + container.WithCapability("CAP_DAC_OVERRIDE"), + }, + stdOut: "hello", + }, + { + doc: "CapabilityRequested=false", + opts: []func(*container.TestContainerConfig){ + container.WithUser("test"), + container.WithDropCapability("CAP_DAC_OVERRIDE"), + }, + stdErr: "exec /bin/cat: operation not permitted", + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.doc, func(t *testing.T) { + // Run the container with the image + opts := append(tc.opts, + container.WithImage(imageTag), + container.WithCmd("/bin/cat", "/txt"), + container.WithSecurityOpt("no-new-privileges=true"), + ) + cid := container.Run(ctx, t, client, opts...) + poll.WaitOn(t, container.IsInState(ctx, client, cid, "exited"), poll.WithDelay(100*time.Millisecond)) + + // Assert on outputs + logReader, err := client.ContainerLogs(ctx, cid, types.ContainerLogsOptions{ + ShowStdout: true, + ShowStderr: true, + }) + assert.NilError(t, err) + defer logReader.Close() + + var actualStdout, actualStderr bytes.Buffer + _, err = stdcopy.StdCopy(&actualStdout, &actualStderr, logReader) + assert.NilError(t, err) + + stdOut := strings.TrimSpace(actualStdout.String()) + stdErr := strings.TrimSpace(actualStderr.String()) + if stdOut != tc.stdOut { + t.Fatalf("test produced invalid output: %q, expected %q. Stderr:%q", stdOut, tc.stdOut, stdErr) + } + if stdErr != tc.stdErr { + t.Fatalf("test produced invalid error: %q, expected %q. Stdout:%q", stdErr, tc.stdErr, stdOut) + + } + }) + } + +} diff --git a/integration/capabilities/main_linux_test.go b/integration/capabilities/main_linux_test.go new file mode 100644 index 0000000000..0f074dd156 --- /dev/null +++ b/integration/capabilities/main_linux_test.go @@ -0,0 +1,33 @@ +package capabilities + +import ( + "fmt" + "os" + "testing" + + "github.com/docker/docker/testutil/environment" +) + +var testEnv *environment.Execution + +func TestMain(m *testing.M) { + var err error + testEnv, err = environment.New() + if err != nil { + fmt.Println(err) + os.Exit(1) + } + err = environment.EnsureFrozenImagesLinux(testEnv) + if err != nil { + fmt.Println(err) + os.Exit(1) + } + + testEnv.Print() + os.Exit(m.Run()) +} + +func setupTest(t *testing.T) func() { + environment.ProtectAll(t, testEnv) + return func() { testEnv.Clean(t) } +} diff --git a/integration/internal/container/ops.go b/integration/internal/container/ops.go index f3101a816c..bb4a8305ec 100644 --- a/integration/internal/container/ops.go +++ b/integration/internal/container/ops.go @@ -241,3 +241,21 @@ func WithRuntime(name string) func(*TestContainerConfig) { c.HostConfig.Runtime = name } } + +func WithCapability(capabilities ...string) func(*TestContainerConfig) { + return func(c *TestContainerConfig) { + c.HostConfig.CapAdd = append(c.HostConfig.CapAdd, capabilities...) + } +} + +func WithDropCapability(capabilities ...string) func(*TestContainerConfig) { + return func(c *TestContainerConfig) { + c.HostConfig.CapDrop = append(c.HostConfig.CapDrop, capabilities...) + } +} + +func WithSecurityOpt(opt string) func(*TestContainerConfig) { + return func(c *TestContainerConfig) { + c.HostConfig.SecurityOpt = append(c.HostConfig.SecurityOpt, opt) + } +}