daemon: oci: obey CL_UNPRIVILEGED for user namespaced daemon
When runc is bind-mounting a particular path "with options", it has to do so by first creating a bind-mount and then modifying the options of said bind-mount via remount. However, in a user namespace, there are restrictions on which flags you can change with a remount (due to CL_UNPRIVILEGED being set in this instance). Docker historically has ignored this, and as a result, internal Docker mounts (such as secrets) haven't worked with --userns-remap. Fix this by preserving CL_UNPRIVILEGED mount flags when Docker is spawning containers with user namespaces enabled. Ref: https://github.com/opencontainers/runc/pull/1603 Signed-off-by: Aleksa Sarai <asarai@suse.de>
This commit is contained in:
parent
f2afa26235
commit
c0f883fdee
1 changed files with 46 additions and 0 deletions
|
@ -26,6 +26,7 @@ import (
|
|||
"github.com/opencontainers/runc/libcontainer/user"
|
||||
specs "github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// nolint: gosimple
|
||||
|
@ -469,6 +470,38 @@ func ensureSharedOrSlave(path string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// Get the set of mount flags that are set on the mount that contains the given
|
||||
// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
|
||||
// bind-mounting "with options" will not fail with user namespaces, due to
|
||||
// kernel restrictions that require user namespace mounts to preserve
|
||||
// CL_UNPRIVILEGED locked flags.
|
||||
func getUnprivilegedMountFlags(path string) ([]string, error) {
|
||||
var statfs unix.Statfs_t
|
||||
if err := unix.Statfs(path, &statfs); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
|
||||
unprivilegedFlags := map[uint64]string{
|
||||
unix.MS_RDONLY: "ro",
|
||||
unix.MS_NODEV: "nodev",
|
||||
unix.MS_NOEXEC: "noexec",
|
||||
unix.MS_NOSUID: "nosuid",
|
||||
unix.MS_NOATIME: "noatime",
|
||||
unix.MS_RELATIME: "relatime",
|
||||
unix.MS_NODIRATIME: "nodiratime",
|
||||
}
|
||||
|
||||
var flags []string
|
||||
for mask, flag := range unprivilegedFlags {
|
||||
if uint64(statfs.Flags)&mask == mask {
|
||||
flags = append(flags, flag)
|
||||
}
|
||||
}
|
||||
|
||||
return flags, nil
|
||||
}
|
||||
|
||||
var (
|
||||
mountPropagationMap = map[string]int{
|
||||
"private": mount.PRIVATE,
|
||||
|
@ -575,6 +608,19 @@ func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []c
|
|||
opts = append(opts, mountPropagationReverseMap[pFlag])
|
||||
}
|
||||
|
||||
// If we are using user namespaces, then we must make sure that we
|
||||
// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
|
||||
// "mount" when we bind-mount. The reason for this is that at the point
|
||||
// when runc sets up the root filesystem, it is already inside a user
|
||||
// namespace, and thus cannot change any flags that are locked.
|
||||
if daemon.configStore.RemappedRoot != "" {
|
||||
unprivOpts, err := getUnprivilegedMountFlags(m.Source)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
opts = append(opts, unprivOpts...)
|
||||
}
|
||||
|
||||
mt.Options = opts
|
||||
s.Mounts = append(s.Mounts, mt)
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue