diff --git a/api/server/router/container/container_routes.go b/api/server/router/container/container_routes.go index f5ecba6253..6f146c2855 100644 --- a/api/server/router/container/container_routes.go +++ b/api/server/router/container/container_routes.go @@ -23,6 +23,7 @@ import ( "github.com/docker/docker/api/types/versions" containerpkg "github.com/docker/docker/container" "github.com/docker/docker/errdefs" + "github.com/docker/docker/libnetwork/netlabel" "github.com/docker/docker/pkg/ioutils" "github.com/docker/docker/runconfig" ocispec "github.com/opencontainers/image-spec/specs-go/v1" @@ -619,6 +620,12 @@ func (s *containerRouter) postContainersCreate(ctx context.Context, w http.Respo warnings = append(warnings, warn) } + if warn, err := handleSysctlBC(hostConfig, networkingConfig, version); err != nil { + return err + } else if warn != "" { + warnings = append(warnings, warn) + } + if hostConfig.PidsLimit != nil && *hostConfig.PidsLimit <= 0 { // Don't set a limit if either no limit was specified, or "unlimited" was // explicitly set. @@ -675,7 +682,7 @@ func handleMACAddressBC(config *container.Config, hostConfig *container.HostConf return "", nil } var warning string - if hostConfig.NetworkMode.IsDefault() || hostConfig.NetworkMode.IsBridge() || hostConfig.NetworkMode.IsUserDefined() { + if hostConfig.NetworkMode.IsBridge() || hostConfig.NetworkMode.IsUserDefined() { ep, err := epConfigForNetMode(version, hostConfig.NetworkMode, networkingConfig) if err != nil { return "", errors.Wrap(err, "unable to migrate container-wide MAC address to a specific network") @@ -694,6 +701,94 @@ func handleMACAddressBC(config *container.Config, hostConfig *container.HostConf return warning, nil } +// handleSysctlBC migrates top level network endpoint-specific '--sysctl' +// settings to an DriverOpts for an endpoint. This is necessary because sysctls +// are applied during container task creation, but sysctls that name an interface +// (for example 'net.ipv6.conf.eth0.forwarding') cannot be applied until the +// interface has been created. So, these settings are removed from hostConfig.Sysctls +// and added to DriverOpts[netlabel.EndpointSysctls]. +// +// Because interface names ('ethN') are allocated sequentially, and the order of +// network connections is not deterministic on container restart, only 'eth0' +// would work reliably in a top-level '--sysctl' option, and then only when +// there's a single initial network connection. So, settings for 'eth0' are +// migrated to the primary interface, identified by 'hostConfig.NetworkMode'. +// Settings for other interfaces are treated as errors. +// +// In the DriverOpts, to restrict sysctl settings to those configuring network +// interfaces and because the interface name cannot be determined in advance, a +// short-form of the sysctl name is used. For example, 'net.ipv6.conf.eth0.forwarding' +// becomes 'ipv6.conf.forwarding'. The value in DriverOpts is a comma-separated list +// of these short-form settings. +// +// A warning is generated when settings are migrated. +func handleSysctlBC( + hostConfig *container.HostConfig, + netConfig *network.NetworkingConfig, + version string, +) (string, error) { + if !hostConfig.NetworkMode.IsPrivate() { + return "", nil + } + + var ep *network.EndpointSettings + var toDelete []string + var netIfSysctls []string + for k, v := range hostConfig.Sysctls { + // If the sysctl name matches "net.*.*.eth0.*" ... + if spl := strings.SplitN(k, ".", 5); len(spl) == 5 && spl[0] == "net" && strings.HasPrefix(spl[3], "eth") { + // Transform the name to the endpoint-specific short form. + netIfSysctl := fmt.Sprintf("%s.%s.%s=%s", spl[1], spl[2], spl[4], v) + // Find the EndpointConfig to migrate settings to, if not already found. + if ep == nil { + var err error + ep, err = epConfigForNetMode(version, hostConfig.NetworkMode, netConfig) + if err != nil { + return "", fmt.Errorf("unable to find a network for sysctl %s: %w", k, err) + } + } + // Only try to migrate settings for "eth0", anything else would always + // have behaved unpredictably. + if spl[3] != "eth0" { + return "", fmt.Errorf(`unable to determine network endpoint for sysctl %s, use '--network=name=%s,sysctl=%s' or compose 'driver_opts: "%s":"%s"`, + k, hostConfig.NetworkMode.NetworkName(), netIfSysctl, + netlabel.EndpointSysctls, netIfSysctl) + } + // Prepare the migration. + toDelete = append(toDelete, k) + netIfSysctls = append(netIfSysctls, netIfSysctl) + } + } + if ep == nil { + return "", nil + } + + // TODO(robmry) - refuse to do the migration, generate an error if API > some-future-version. + + newDriverOpt := strings.Join(netIfSysctls, ",") + warning := fmt.Sprintf(`Migrated %s to DriverOpts{"%s":"%s"}. (Use "--network=name=%s,sysctl=%s", or compose "driver_opts".)`, + strings.Join(toDelete, ","), + netlabel.EndpointSysctls, newDriverOpt, + hostConfig.NetworkMode.NetworkName(), strings.Join(netIfSysctls, ",sysctl=")) + + // Append exiting per-endpoint sysctls to the migrated sysctls (give priority + // to per-endpoint settings). + if ep.DriverOpts == nil { + ep.DriverOpts = map[string]string{} + } + if oldDriverOpt, ok := ep.DriverOpts[netlabel.EndpointSysctls]; ok { + newDriverOpt += "," + oldDriverOpt + } + ep.DriverOpts[netlabel.EndpointSysctls] = newDriverOpt + + // Delete migrated settings from the top-level sysctls. + for _, k := range toDelete { + delete(hostConfig.Sysctls, k) + } + + return warning, nil +} + // epConfigForNetMode finds, or creates, an entry in netConfig.EndpointsConfig // corresponding to nwMode. // diff --git a/api/server/router/container/container_routes_test.go b/api/server/router/container/container_routes_test.go index 7c70e1c01f..21bf56a654 100644 --- a/api/server/router/container/container_routes_test.go +++ b/api/server/router/container/container_routes_test.go @@ -1,10 +1,12 @@ package container import ( + "strings" "testing" "github.com/docker/docker/api/types/container" "github.com/docker/docker/api/types/network" + "github.com/docker/docker/libnetwork/netlabel" "gotest.tools/v3/assert" is "gotest.tools/v3/assert/cmp" ) @@ -228,3 +230,85 @@ func TestEpConfigForNetMode(t *testing.T) { }) } } + +func TestHandleSysctlBC(t *testing.T) { + testcases := []struct { + name string + networkMode string + sysctls map[string]string + expEpSysctls []string + expSysctls map[string]string + expWarningContains []string + expError string + }{ + { + name: "migrate to new ep", + networkMode: "mynet", + sysctls: map[string]string{ + "net.ipv6.conf.all.disable_ipv6": "0", + "net.ipv6.conf.eth0.accept_ra": "2", + "net.ipv6.conf.eth0.forwarding": "1", + }, + expSysctls: map[string]string{ + "net.ipv6.conf.all.disable_ipv6": "0", + }, + expEpSysctls: []string{"ipv6.conf.forwarding=1", "ipv6.conf.accept_ra=2"}, + expWarningContains: []string{ + "Migrated", + "net.ipv6.conf.eth0.accept_ra", "ipv6.conf.accept_ra=2", + "net.ipv6.conf.eth0.forwarding", "ipv6.conf.forwarding=1", + }, + }, + { + name: "migrate nothing", + networkMode: "mynet", + sysctls: map[string]string{ + "net.ipv6.conf.all.disable_ipv6": "0", + }, + expSysctls: map[string]string{ + "net.ipv6.conf.all.disable_ipv6": "0", + }, + }, + } + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + hostCfg := &container.HostConfig{ + NetworkMode: container.NetworkMode(tc.networkMode), + Sysctls: map[string]string{}, + } + for k, v := range tc.sysctls { + hostCfg.Sysctls[k] = v + } + netCfg := &network.NetworkingConfig{} + + warnings, err := handleSysctlBC(hostCfg, netCfg, "1.45") + + if tc.expError == "" { + assert.Check(t, err) + } else { + assert.Check(t, is.ErrorContains(err, tc.expError)) + } + + for _, s := range tc.expWarningContains { + assert.Check(t, is.Contains(warnings, s)) + } + + assert.Check(t, is.DeepEqual(hostCfg.Sysctls, tc.expSysctls)) + + ep := netCfg.EndpointsConfig[tc.networkMode] + if ep == nil { + assert.Check(t, is.Nil(tc.expEpSysctls)) + } else { + got, ok := ep.DriverOpts[netlabel.EndpointSysctls] + assert.Check(t, ok) + // Check for expected ep-sysctls. + for _, want := range tc.expEpSysctls { + assert.Check(t, is.Contains(got, want)) + } + // Check for unexpected ep-sysctls. + assert.Check(t, is.Len(got, len(strings.Join(tc.expEpSysctls, ",")))) + } + }) + } +} diff --git a/daemon/container_operations.go b/daemon/container_operations.go index 48235ddd7d..f31832e7c6 100644 --- a/daemon/container_operations.go +++ b/daemon/container_operations.go @@ -614,6 +614,18 @@ func validateEndpointSettings(nw *libnetwork.Network, nwName string, epConfig *n } } + if sysctls, ok := epConfig.DriverOpts[netlabel.EndpointSysctls]; ok { + for _, sysctl := range strings.Split(sysctls, ",") { + scname := strings.SplitN(sysctl, ".", 3) + if len(scname) != 3 || (scname[0] != "ipv4" && scname[0] != "ipv6") { + errs = append(errs, + fmt.Errorf( + "unrecognised network interface sysctl '%s'; represent 'net.X.Y.ethN.Z=V' as 'X.Y.Z=V', 'X' must be 'ipv4' or 'ipv6'", + sysctl)) + } + } + } + if err := multierror.Join(errs...); err != nil { return fmt.Errorf("invalid endpoint settings:\n%w", err) } diff --git a/integration/networking/bridge_test.go b/integration/networking/bridge_test.go index b5eb0fdce0..d824dac1fc 100644 --- a/integration/networking/bridge_test.go +++ b/integration/networking/bridge_test.go @@ -9,8 +9,10 @@ import ( "github.com/docker/docker/api/types" containertypes "github.com/docker/docker/api/types/container" + apinetwork "github.com/docker/docker/api/types/network" "github.com/docker/docker/integration/internal/container" "github.com/docker/docker/integration/internal/network" + "github.com/docker/docker/libnetwork/netlabel" "github.com/docker/docker/testutil" "github.com/docker/docker/testutil/daemon" "github.com/google/go-cmp/cmp/cmpopts" @@ -703,3 +705,31 @@ func TestSetInterfaceSysctl(t *testing.T) { stdout := runRes.Stdout.String() assert.Check(t, is.Contains(stdout, scName)) } + +// Test that it's possible to set a sysctl on an interface in the container +// using DriverOpts. +func TestSetEndpointSysctl(t *testing.T) { + skip.If(t, testEnv.DaemonInfo.OSType == "windows", "no sysctl on Windows") + + ctx := setupTest(t) + d := daemon.New(t) + d.StartWithBusybox(ctx, t) + defer d.Stop(t) + + c := d.NewClientT(t) + defer c.Close() + + const scName = "net.ipv4.conf.eth0.forwarding" + runRes := container.RunAttach(ctx, t, c, + container.WithCmd("sysctl", scName), + container.WithEndpointSettings(apinetwork.NetworkBridge, &apinetwork.EndpointSettings{ + DriverOpts: map[string]string{ + netlabel.EndpointSysctls: "ipv4.conf.forwarding=1", + }, + }), + ) + defer c.ContainerRemove(ctx, runRes.ContainerID, containertypes.RemoveOptions{Force: true}) + + stdout := runRes.Stdout.String() + assert.Check(t, is.Contains(stdout, scName)) +} diff --git a/libnetwork/endpoint.go b/libnetwork/endpoint.go index 9c1b7eee15..07a31929f7 100644 --- a/libnetwork/endpoint.go +++ b/libnetwork/endpoint.go @@ -8,6 +8,7 @@ import ( "encoding/json" "fmt" "net" + "strings" "sync" "github.com/containerd/log" @@ -400,6 +401,18 @@ func (ep *Endpoint) Value() []byte { return b } +func (ep *Endpoint) getSysctls() []string { + ep.mu.Lock() + defer ep.mu.Unlock() + + if s, ok := ep.generic[netlabel.EndpointSysctls]; ok { + if ss, ok := s.(string); ok { + return strings.Split(ss, ",") + } + } + return nil +} + func (ep *Endpoint) SetValue(value []byte) error { return json.Unmarshal(value, ep) } diff --git a/libnetwork/netlabel/labels.go b/libnetwork/netlabel/labels.go index 91232af6aa..817e9eec2c 100644 --- a/libnetwork/netlabel/labels.go +++ b/libnetwork/netlabel/labels.go @@ -26,6 +26,9 @@ const ( // DNSServers A list of DNS servers associated with the endpoint DNSServers = Prefix + ".endpoint.dnsservers" + // EndpointSysctls is a comma separated list of shortened sysctls ('net.X.Y.ethN.Z=V' -> 'X.Y.Z=V'). + EndpointSysctls = Prefix + ".endpoint.sysctls" + // EnableIPv6 constant represents enabling IPV6 at network level EnableIPv6 = Prefix + ".enable_ipv6" diff --git a/libnetwork/osl/interface_linux.go b/libnetwork/osl/interface_linux.go index e87efbaa39..ebc571e76b 100644 --- a/libnetwork/osl/interface_linux.go +++ b/libnetwork/osl/interface_linux.go @@ -4,12 +4,16 @@ import ( "context" "fmt" "net" + "os" + "path/filepath" + "strings" "syscall" "time" "github.com/containerd/log" "github.com/docker/docker/libnetwork/ns" "github.com/docker/docker/libnetwork/types" + "github.com/pkg/errors" "github.com/vishvananda/netlink" "github.com/vishvananda/netns" ) @@ -55,6 +59,7 @@ type Interface struct { llAddrs []*net.IPNet routes []*net.IPNet bridge bool + sysctls []string ns *Namespace } @@ -223,7 +228,7 @@ func (n *Namespace) AddInterface(srcName, dstPrefix string, options ...IfaceOpti } // Configure the interface now this is moved in the proper namespace. - if err := configureInterface(nlh, iface, i); err != nil { + if err := n.configureInterface(nlh, iface, i); err != nil { // If configuring the device fails move it back to the host namespace // and change the name back to the source name. This allows the caller // to properly cleanup the interface. Its important especially for @@ -312,7 +317,7 @@ func (n *Namespace) RemoveInterface(i *Interface) error { return nil } -func configureInterface(nlh *netlink.Handle, iface netlink.Link, i *Interface) error { +func (n *Namespace) configureInterface(nlh *netlink.Handle, iface netlink.Link, i *Interface) error { ifaceName := iface.Attrs().Name ifaceConfigurators := []struct { Fn func(*netlink.Handle, netlink.Link, *Interface) error @@ -331,6 +336,11 @@ func configureInterface(nlh *netlink.Handle, iface netlink.Link, i *Interface) e return fmt.Errorf("%s: %v", config.ErrMessage, err) } } + + if err := n.setSysctls(i.dstName, i.sysctls); err != nil { + return err + } + return nil } @@ -386,6 +396,43 @@ func setInterfaceLinkLocalIPs(nlh *netlink.Handle, iface netlink.Link, i *Interf return nil } +func (n *Namespace) setSysctls(ifName string, sysctls []string) error { + for _, sc := range sysctls { + k, v, found := strings.Cut(sc, "=") + if !found { + return fmt.Errorf("expected sysctl '%s' to have format name=value", sc) + } + sk := strings.Split(k, ".") + if len(sk) != 3 { + return fmt.Errorf("expected sysctl '%s' to have format X.Y.Z, to map to net.X.Y.%s.Z", sc, ifName) + } + + scPath := []string{"/proc/sys/net", sk[0], sk[1], ifName} + scPath = append(scPath, sk[2:]...) + sysPath := filepath.Join(scPath...) + + errC := make(chan error, 1) + err := n.InvokeFunc(func() { + if fi, err := os.Stat(sysPath); err != nil || !fi.Mode().IsRegular() { + errC <- fmt.Errorf("%s is not a sysctl file", sysPath) + } else if err := os.WriteFile(sysPath, []byte(v), 0o644); err != nil { + errC <- err + } + close(errC) + }) + writeErr := <-errC + + if err != nil { + return errors.Wrapf(err, "failed to run sysctl setter in network namespace") + } + if writeErr != nil { + return errors.Wrapf(writeErr, "unable to write to '%s' (derived from '%s', use format X.Y.Z to map to net.X.Y.%s.Z)", + sysPath, k, ifName) + } + } + return nil +} + func setInterfaceName(nlh *netlink.Handle, iface netlink.Link, i *Interface) error { return nlh.LinkSetName(iface, i.DstName()) } diff --git a/libnetwork/osl/options_linux.go b/libnetwork/osl/options_linux.go index bc617c7b0a..f2e668f242 100644 --- a/libnetwork/osl/options_linux.go +++ b/libnetwork/osl/options_linux.go @@ -81,3 +81,11 @@ func WithRoutes(routes []*net.IPNet) IfaceOption { return nil } } + +// WithSysctls sets the interface sysctls. +func WithSysctls(sysctls []string) IfaceOption { + return func(i *Interface) error { + i.sysctls = sysctls + return nil + } +} diff --git a/libnetwork/sandbox_linux.go b/libnetwork/sandbox_linux.go index 7db5edf7c6..ae17be839b 100644 --- a/libnetwork/sandbox_linux.go +++ b/libnetwork/sandbox_linux.go @@ -312,6 +312,9 @@ func (sb *Sandbox) populateNetworkResources(ep *Endpoint) error { if i.mac != nil { ifaceOptions = append(ifaceOptions, osl.WithMACAddress(i.mac)) } + if sysctls := ep.getSysctls(); len(sysctls) > 0 { + ifaceOptions = append(ifaceOptions, osl.WithSysctls(sysctls)) + } if err := sb.osSbox.AddInterface(i.srcName, i.dstPrefix, ifaceOptions...); err != nil { return fmt.Errorf("failed to add interface %s to sandbox: %v", i.srcName, err)