Merge pull request #44491 from corhere/libnetwork-minus-reexec
libnetwork: eliminate almost all reexecs
This commit is contained in:
commit
483b03562a
8 changed files with 233 additions and 489 deletions
|
@ -8,7 +8,6 @@ import (
|
|||
"fmt"
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
|
@ -23,7 +22,6 @@ import (
|
|||
"github.com/docker/docker/libnetwork/osl"
|
||||
"github.com/docker/docker/libnetwork/resolvconf"
|
||||
"github.com/docker/docker/libnetwork/types"
|
||||
"github.com/docker/docker/pkg/reexec"
|
||||
"github.com/sirupsen/logrus"
|
||||
"github.com/vishvananda/netlink"
|
||||
"github.com/vishvananda/netlink/nl"
|
||||
|
@ -75,67 +73,15 @@ type network struct {
|
|||
}
|
||||
|
||||
func init() {
|
||||
reexec.Register("set-default-vlan", setDefaultVlan)
|
||||
|
||||
// Lock main() to the initial thread to exclude the goroutines executing
|
||||
// func (*network).watchMiss() from being scheduled onto that thread.
|
||||
// Changes to the network namespace of the initial thread alter
|
||||
// /proc/self/ns/net, which would break any code which (incorrectly)
|
||||
// assumes that that file is a handle to the network namespace for the
|
||||
// thread it is currently executing on.
|
||||
// func (*network).watchMiss() or func setDefaultVLAN() from being
|
||||
// scheduled onto that thread. Changes to the network namespace of the
|
||||
// initial thread alter /proc/self/ns/net, which would break any code
|
||||
// which (incorrectly) assumes that that file is a handle to the network
|
||||
// namespace for the thread it is currently executing on.
|
||||
runtime.LockOSThread()
|
||||
}
|
||||
|
||||
func setDefaultVlan() {
|
||||
if len(os.Args) < 3 {
|
||||
logrus.Error("insufficient number of arguments")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
nsPath := os.Args[1]
|
||||
ns, err := netns.GetFromPath(nsPath)
|
||||
if err != nil {
|
||||
logrus.Errorf("overlay namespace get failed, %v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
if err = netns.Set(ns); err != nil {
|
||||
logrus.Errorf("setting into overlay namespace failed, %v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// make sure the sysfs mount doesn't propagate back
|
||||
if err = unix.Unshare(unix.CLONE_NEWNS); err != nil {
|
||||
logrus.Errorf("unshare failed, %v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
flag := unix.MS_PRIVATE | unix.MS_REC
|
||||
if err = unix.Mount("", "/", "", uintptr(flag), ""); err != nil {
|
||||
logrus.Errorf("root mount failed, %v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if err = unix.Mount("sysfs", "/sys", "sysfs", 0, ""); err != nil {
|
||||
logrus.Errorf("mounting sysfs failed, %v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
brName := os.Args[2]
|
||||
// IFLA_BR_VLAN_DEFAULT_PVID was added in Linux v4.4 (see torvalds/linux@0f963b7), so we can't use netlink for
|
||||
// setting this until Docker drops support for CentOS/RHEL 7 (kernel 3.10, eol date: 2024-06-30).
|
||||
path := filepath.Join("/sys/class/net", brName, "bridge/default_pvid")
|
||||
data := []byte{'0', '\n'}
|
||||
|
||||
if err = os.WriteFile(path, data, 0644); err != nil {
|
||||
logrus.Errorf("enabling default vlan on bridge %s failed %v", brName, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
func (d *driver) NetworkAllocate(id string, option map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) {
|
||||
return nil, types.NotImplementedErrorf("not implemented")
|
||||
}
|
||||
|
@ -641,32 +587,66 @@ func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error
|
|||
return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err)
|
||||
}
|
||||
|
||||
if !hostMode {
|
||||
var name string
|
||||
for _, i := range sbox.Info().Interfaces() {
|
||||
if i.Bridge() {
|
||||
name = i.DstName()
|
||||
}
|
||||
}
|
||||
cmd := &exec.Cmd{
|
||||
Path: reexec.Self(),
|
||||
Args: []string{"set-default-vlan", sbox.Key(), name},
|
||||
Stdout: os.Stdout,
|
||||
Stderr: os.Stderr,
|
||||
}
|
||||
if err := cmd.Run(); err != nil {
|
||||
// not a fatal error
|
||||
logrus.Errorf("reexec to set bridge default vlan failed %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
if hostMode {
|
||||
if err := addFilters(n.id[:12], brName); err != nil {
|
||||
return err
|
||||
return addFilters(n.id[:12], brName)
|
||||
}
|
||||
|
||||
if err := setDefaultVLAN(sbox); err != nil {
|
||||
// not a fatal error
|
||||
logrus.WithError(err).Error("set bridge default vlan failed")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func setDefaultVLAN(sbox osl.Sandbox) error {
|
||||
var brName string
|
||||
for _, i := range sbox.Info().Interfaces() {
|
||||
if i.Bridge() {
|
||||
brName = i.DstName()
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
// IFLA_BR_VLAN_DEFAULT_PVID was added in Linux v4.4 (see torvalds/linux@0f963b7), so we can't use netlink for
|
||||
// setting this until Docker drops support for CentOS/RHEL 7 (kernel 3.10, eol date: 2024-06-30).
|
||||
var innerErr error
|
||||
err := sbox.InvokeFunc(func() {
|
||||
// Contrary to what the sysfs(5) man page says, the entries of /sys/class/net
|
||||
// represent the networking devices visible in the network namespace of the
|
||||
// process which mounted the sysfs filesystem, irrespective of the network
|
||||
// namespace of the process accessing the directory. Remount sysfs in order to
|
||||
// see the network devices in sbox's network namespace, making sure the mount
|
||||
// doesn't propagate back.
|
||||
//
|
||||
// The Linux implementation of (osl.Sandbox).InvokeFunc() runs the function in a
|
||||
// dedicated goroutine. The effects of unshare(CLONE_NEWNS) on a thread cannot
|
||||
// be reverted so the thread needs to be terminated once the goroutine is
|
||||
// finished.
|
||||
runtime.LockOSThread()
|
||||
if err := unix.Unshare(unix.CLONE_NEWNS); err != nil {
|
||||
innerErr = os.NewSyscallError("unshare", err)
|
||||
return
|
||||
}
|
||||
if err := unix.Mount("", "/", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
|
||||
innerErr = &os.PathError{Op: "mount", Path: "/", Err: err}
|
||||
return
|
||||
}
|
||||
if err := unix.Mount("sysfs", "/sys", "sysfs", 0, ""); err != nil {
|
||||
innerErr = &os.PathError{Op: "mount", Path: "/sys", Err: err}
|
||||
return
|
||||
}
|
||||
|
||||
path := filepath.Join("/sys/class/net", brName, "bridge/default_pvid")
|
||||
data := []byte{'0', '\n'}
|
||||
|
||||
if err := os.WriteFile(path, data, 0o644); err != nil {
|
||||
innerErr = fmt.Errorf("failed to enable default vlan on bridge %s: %w", brName, err)
|
||||
return
|
||||
}
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return innerErr
|
||||
}
|
||||
|
||||
// Must be called with the network lock
|
||||
|
|
|
@ -50,7 +50,7 @@ func (n *network) startResolver() {
|
|||
for _, subnet := range hnsresponse.Subnets {
|
||||
if subnet.GatewayAddress != "" {
|
||||
for i := 0; i < 3; i++ {
|
||||
resolver := NewResolver(subnet.GatewayAddress, false, "", n)
|
||||
resolver := NewResolver(subnet.GatewayAddress, false, n)
|
||||
logrus.Debugf("Binding a resolver on network %s gateway %s", n.Name(), subnet.GatewayAddress)
|
||||
executeInCompartment(hnsresponse.DNSServerCompartment, resolver.SetupFunc(53))
|
||||
|
||||
|
|
|
@ -5,7 +5,6 @@ import (
|
|||
"fmt"
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
|
@ -14,10 +13,10 @@ import (
|
|||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/docker/docker/internal/unshare"
|
||||
"github.com/docker/docker/libnetwork/ns"
|
||||
"github.com/docker/docker/libnetwork/osl/kernel"
|
||||
"github.com/docker/docker/libnetwork/types"
|
||||
"github.com/docker/docker/pkg/reexec"
|
||||
"github.com/sirupsen/logrus"
|
||||
"github.com/vishvananda/netlink"
|
||||
"github.com/vishvananda/netns"
|
||||
|
@ -27,13 +26,11 @@ import (
|
|||
const defaultPrefix = "/var/run/docker"
|
||||
|
||||
func init() {
|
||||
reexec.Register("set-ipv6", reexecSetIPv6)
|
||||
|
||||
// Lock main() to the initial thread to exclude the goroutines spawned
|
||||
// by func (*networkNamespace) InvokeFunc() from being scheduled onto
|
||||
// that thread. Changes to the network namespace of the initial thread
|
||||
// alter /proc/self/ns/net, which would break any code which
|
||||
// (incorrectly) assumes that that file is a handle to the network
|
||||
// by func (*networkNamespace) InvokeFunc() or func setIPv6() below from
|
||||
// being scheduled onto that thread. Changes to the network namespace of
|
||||
// the initial thread alter /proc/self/ns/net, which would break any
|
||||
// code which (incorrectly) assumes that that file is the network
|
||||
// namespace for the thread it is currently executing on.
|
||||
runtime.LockOSThread()
|
||||
}
|
||||
|
@ -70,10 +67,6 @@ func SetBasePath(path string) {
|
|||
prefix = path
|
||||
}
|
||||
|
||||
func init() {
|
||||
reexec.Register("netns-create", reexecCreateNamespace)
|
||||
}
|
||||
|
||||
func basePath() string {
|
||||
return filepath.Join(prefix, "netns")
|
||||
}
|
||||
|
@ -301,35 +294,18 @@ func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) {
|
|||
return n, nil
|
||||
}
|
||||
|
||||
func reexecCreateNamespace() {
|
||||
if len(os.Args) < 2 {
|
||||
logrus.Fatal("no namespace path provided")
|
||||
}
|
||||
if err := mountNetworkNamespace("/proc/self/ns/net", os.Args[1]); err != nil {
|
||||
logrus.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func createNetworkNamespace(path string, osCreate bool) error {
|
||||
if err := createNamespaceFile(path); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
cmd := &exec.Cmd{
|
||||
Path: reexec.Self(),
|
||||
Args: append([]string{"netns-create"}, path),
|
||||
Stdout: os.Stdout,
|
||||
Stderr: os.Stderr,
|
||||
do := func() error {
|
||||
return mountNetworkNamespace(fmt.Sprintf("/proc/self/task/%d/ns/net", unix.Gettid()), path)
|
||||
}
|
||||
if osCreate {
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{}
|
||||
cmd.SysProcAttr.Cloneflags = syscall.CLONE_NEWNET
|
||||
return unshare.Go(unix.CLONE_NEWNET, do, nil)
|
||||
}
|
||||
if err := cmd.Run(); err != nil {
|
||||
return fmt.Errorf("namespace creation reexec command failed: %v", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
return do()
|
||||
}
|
||||
|
||||
func unmountNamespaceFile(path string) {
|
||||
|
@ -623,66 +599,65 @@ func (n *networkNamespace) checkLoV6() {
|
|||
n.loV6Enabled = enable
|
||||
}
|
||||
|
||||
func reexecSetIPv6() {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
if len(os.Args) < 3 {
|
||||
logrus.Errorf("invalid number of arguments for %s", os.Args[0])
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
ns, err := netns.GetFromPath(os.Args[1])
|
||||
func setIPv6(nspath, iface string, enable bool) error {
|
||||
origns, err := netns.Get()
|
||||
if err != nil {
|
||||
logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err)
|
||||
os.Exit(2)
|
||||
return fmt.Errorf("failed to get current network namespace: %w", err)
|
||||
}
|
||||
defer origns.Close()
|
||||
|
||||
ns, err := netns.GetFromPath(nspath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed get network namespace %q: %w", nspath, err)
|
||||
}
|
||||
defer ns.Close()
|
||||
|
||||
if err = netns.Set(ns); err != nil {
|
||||
logrus.Errorf("setting into container netns %q failed: %v", os.Args[1], err)
|
||||
os.Exit(3)
|
||||
}
|
||||
errCh := make(chan error, 1)
|
||||
go func() {
|
||||
defer close(errCh)
|
||||
|
||||
var (
|
||||
action = "disable"
|
||||
value = byte('1')
|
||||
path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", os.Args[2])
|
||||
)
|
||||
|
||||
if os.Args[3] == "true" {
|
||||
action = "enable"
|
||||
value = byte('0')
|
||||
}
|
||||
|
||||
if _, err := os.Stat(path); err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
logrus.Warnf("file does not exist: %s : %v Has IPv6 been disabled in this node's kernel?", path, err)
|
||||
os.Exit(0)
|
||||
runtime.LockOSThread()
|
||||
if err = netns.Set(ns); err != nil {
|
||||
errCh <- fmt.Errorf("setting into container netns %q failed: %w", nspath, err)
|
||||
return
|
||||
}
|
||||
logrus.Errorf("failed to stat %s : %v", path, err)
|
||||
os.Exit(5)
|
||||
}
|
||||
defer func() {
|
||||
if err := netns.Set(origns); err != nil {
|
||||
logrus.WithError(err).Error("libnetwork: restoring thread network namespace failed")
|
||||
// The error is only fatal for the current thread. Keep this
|
||||
// goroutine locked to the thread to make the runtime replace it
|
||||
// with a clean thread once this goroutine returns.
|
||||
} else {
|
||||
runtime.UnlockOSThread()
|
||||
}
|
||||
}()
|
||||
|
||||
if err = os.WriteFile(path, []byte{value, '\n'}, 0644); err != nil {
|
||||
logrus.Errorf("failed to %s IPv6 forwarding for container's interface %s: %v", action, os.Args[2], err)
|
||||
os.Exit(4)
|
||||
}
|
||||
var (
|
||||
action = "disable"
|
||||
value = byte('1')
|
||||
path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", iface)
|
||||
)
|
||||
|
||||
os.Exit(0)
|
||||
}
|
||||
if enable {
|
||||
action = "enable"
|
||||
value = '0'
|
||||
}
|
||||
|
||||
func setIPv6(path, iface string, enable bool) error {
|
||||
cmd := &exec.Cmd{
|
||||
Path: reexec.Self(),
|
||||
Args: append([]string{"set-ipv6"}, path, iface, strconv.FormatBool(enable)),
|
||||
Stdout: os.Stdout,
|
||||
Stderr: os.Stderr,
|
||||
}
|
||||
if err := cmd.Run(); err != nil {
|
||||
return fmt.Errorf("reexec to set IPv6 failed: %v", err)
|
||||
}
|
||||
return nil
|
||||
if _, err := os.Stat(path); err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
logrus.WithError(err).Warn("Cannot configure IPv6 forwarding on container interface. Has IPv6 been disabled in this node's kernel?")
|
||||
return
|
||||
}
|
||||
errCh <- err
|
||||
return
|
||||
}
|
||||
|
||||
if err = os.WriteFile(path, []byte{value, '\n'}, 0o644); err != nil {
|
||||
errCh <- fmt.Errorf("failed to %s IPv6 forwarding for container's interface %s: %w", action, iface, err)
|
||||
return
|
||||
}
|
||||
}()
|
||||
return <-errCh
|
||||
}
|
||||
|
||||
// ApplyOSTweaks applies linux configs on the sandbox
|
||||
|
|
|
@ -90,7 +90,6 @@ type resolver struct {
|
|||
queryLock sync.Mutex
|
||||
listenAddress string
|
||||
proxyDNS bool
|
||||
resolverKey string
|
||||
startCh chan struct{}
|
||||
}
|
||||
|
||||
|
@ -99,12 +98,11 @@ func init() {
|
|||
}
|
||||
|
||||
// NewResolver creates a new instance of the Resolver
|
||||
func NewResolver(address string, proxyDNS bool, resolverKey string, backend DNSBackend) Resolver {
|
||||
func NewResolver(address string, proxyDNS bool, backend DNSBackend) Resolver {
|
||||
return &resolver{
|
||||
backend: backend,
|
||||
proxyDNS: proxyDNS,
|
||||
listenAddress: address,
|
||||
resolverKey: resolverKey,
|
||||
err: fmt.Errorf("setup not done yet"),
|
||||
startCh: make(chan struct{}, 1),
|
||||
}
|
||||
|
|
|
@ -122,7 +122,7 @@ func TestDNSIPQuery(t *testing.T) {
|
|||
|
||||
w := new(tstwriter)
|
||||
// the unit tests right now will focus on non-proxyed DNS requests
|
||||
r := NewResolver(resolverIPSandbox, false, sb.Key(), sb.(*sandbox))
|
||||
r := NewResolver(resolverIPSandbox, false, sb.(*sandbox))
|
||||
|
||||
// test name1's IP is resolved correctly with the default A type query
|
||||
// Also make sure DNS lookups are case insensitive
|
||||
|
@ -266,7 +266,7 @@ func TestDNSProxyServFail(t *testing.T) {
|
|||
t.Log("DNS Server can be reached")
|
||||
|
||||
w := new(tstwriter)
|
||||
r := NewResolver(resolverIPSandbox, true, sb.Key(), sb.(*sandbox))
|
||||
r := NewResolver(resolverIPSandbox, true, sb.(*sandbox))
|
||||
q := new(dns.Msg)
|
||||
q.SetQuestion("name1.", dns.TypeA)
|
||||
|
||||
|
|
|
@ -4,22 +4,12 @@
|
|||
package libnetwork
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"runtime"
|
||||
|
||||
"github.com/docker/docker/libnetwork/iptables"
|
||||
"github.com/docker/docker/pkg/reexec"
|
||||
"github.com/sirupsen/logrus"
|
||||
"github.com/vishvananda/netns"
|
||||
)
|
||||
|
||||
func init() {
|
||||
reexec.Register("setup-resolver", reexecSetupResolver)
|
||||
}
|
||||
|
||||
const (
|
||||
// outputChain used for docker embed dns
|
||||
outputChain = "DOCKER_OUTPUT"
|
||||
|
@ -27,79 +17,46 @@ const (
|
|||
postroutingchain = "DOCKER_POSTROUTING"
|
||||
)
|
||||
|
||||
func reexecSetupResolver() {
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
if len(os.Args) < 4 {
|
||||
logrus.Error("invalid number of arguments..")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
resolverIP, ipPort, _ := net.SplitHostPort(os.Args[2])
|
||||
_, tcpPort, _ := net.SplitHostPort(os.Args[3])
|
||||
rules := [][]string{
|
||||
{"-t", "nat", "-I", outputChain, "-d", resolverIP, "-p", "udp", "--dport", dnsPort, "-j", "DNAT", "--to-destination", os.Args[2]},
|
||||
{"-t", "nat", "-I", postroutingchain, "-s", resolverIP, "-p", "udp", "--sport", ipPort, "-j", "SNAT", "--to-source", ":" + dnsPort},
|
||||
{"-t", "nat", "-I", outputChain, "-d", resolverIP, "-p", "tcp", "--dport", dnsPort, "-j", "DNAT", "--to-destination", os.Args[3]},
|
||||
{"-t", "nat", "-I", postroutingchain, "-s", resolverIP, "-p", "tcp", "--sport", tcpPort, "-j", "SNAT", "--to-source", ":" + dnsPort},
|
||||
}
|
||||
|
||||
f, err := os.OpenFile(os.Args[1], os.O_RDONLY, 0)
|
||||
if err != nil {
|
||||
logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err)
|
||||
os.Exit(2)
|
||||
}
|
||||
defer f.Close() //nolint:gosec
|
||||
|
||||
nsFD := f.Fd()
|
||||
if err = netns.Set(netns.NsHandle(nsFD)); err != nil {
|
||||
logrus.Errorf("setting into container net ns %v failed, %v", os.Args[1], err)
|
||||
os.Exit(3)
|
||||
}
|
||||
|
||||
// TODO IPv6 support
|
||||
iptable := iptables.GetIptable(iptables.IPv4)
|
||||
|
||||
// insert outputChain and postroutingchain
|
||||
err = iptable.RawCombinedOutputNative("-t", "nat", "-C", "OUTPUT", "-d", resolverIP, "-j", outputChain)
|
||||
if err == nil {
|
||||
iptable.RawCombinedOutputNative("-t", "nat", "-F", outputChain)
|
||||
} else {
|
||||
iptable.RawCombinedOutputNative("-t", "nat", "-N", outputChain)
|
||||
iptable.RawCombinedOutputNative("-t", "nat", "-I", "OUTPUT", "-d", resolverIP, "-j", outputChain)
|
||||
}
|
||||
|
||||
err = iptable.RawCombinedOutputNative("-t", "nat", "-C", "POSTROUTING", "-d", resolverIP, "-j", postroutingchain)
|
||||
if err == nil {
|
||||
iptable.RawCombinedOutputNative("-t", "nat", "-F", postroutingchain)
|
||||
} else {
|
||||
iptable.RawCombinedOutputNative("-t", "nat", "-N", postroutingchain)
|
||||
iptable.RawCombinedOutputNative("-t", "nat", "-I", "POSTROUTING", "-d", resolverIP, "-j", postroutingchain)
|
||||
}
|
||||
|
||||
for _, rule := range rules {
|
||||
if iptable.RawCombinedOutputNative(rule...) != nil {
|
||||
logrus.Errorf("set up rule failed, %v", rule)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (r *resolver) setupIPTable() error {
|
||||
if r.err != nil {
|
||||
return r.err
|
||||
}
|
||||
laddr := r.conn.LocalAddr().String()
|
||||
ltcpaddr := r.tcpListen.Addr().String()
|
||||
resolverIP, ipPort, _ := net.SplitHostPort(laddr)
|
||||
_, tcpPort, _ := net.SplitHostPort(ltcpaddr)
|
||||
rules := [][]string{
|
||||
{"-t", "nat", "-I", outputChain, "-d", resolverIP, "-p", "udp", "--dport", dnsPort, "-j", "DNAT", "--to-destination", laddr},
|
||||
{"-t", "nat", "-I", postroutingchain, "-s", resolverIP, "-p", "udp", "--sport", ipPort, "-j", "SNAT", "--to-source", ":" + dnsPort},
|
||||
{"-t", "nat", "-I", outputChain, "-d", resolverIP, "-p", "tcp", "--dport", dnsPort, "-j", "DNAT", "--to-destination", ltcpaddr},
|
||||
{"-t", "nat", "-I", postroutingchain, "-s", resolverIP, "-p", "tcp", "--sport", tcpPort, "-j", "SNAT", "--to-source", ":" + dnsPort},
|
||||
}
|
||||
|
||||
cmd := &exec.Cmd{
|
||||
Path: reexec.Self(),
|
||||
Args: append([]string{"setup-resolver"}, r.resolverKey, laddr, ltcpaddr),
|
||||
Stdout: os.Stdout,
|
||||
Stderr: os.Stderr,
|
||||
}
|
||||
if err := cmd.Run(); err != nil {
|
||||
return fmt.Errorf("reexec failed: %v", err)
|
||||
}
|
||||
return nil
|
||||
return r.backend.ExecFunc(func() {
|
||||
// TODO IPv6 support
|
||||
iptable := iptables.GetIptable(iptables.IPv4)
|
||||
|
||||
// insert outputChain and postroutingchain
|
||||
err := iptable.RawCombinedOutputNative("-t", "nat", "-C", "OUTPUT", "-d", resolverIP, "-j", outputChain)
|
||||
if err == nil {
|
||||
iptable.RawCombinedOutputNative("-t", "nat", "-F", outputChain)
|
||||
} else {
|
||||
iptable.RawCombinedOutputNative("-t", "nat", "-N", outputChain)
|
||||
iptable.RawCombinedOutputNative("-t", "nat", "-I", "OUTPUT", "-d", resolverIP, "-j", outputChain)
|
||||
}
|
||||
|
||||
err = iptable.RawCombinedOutputNative("-t", "nat", "-C", "POSTROUTING", "-d", resolverIP, "-j", postroutingchain)
|
||||
if err == nil {
|
||||
iptable.RawCombinedOutputNative("-t", "nat", "-F", postroutingchain)
|
||||
} else {
|
||||
iptable.RawCombinedOutputNative("-t", "nat", "-N", postroutingchain)
|
||||
iptable.RawCombinedOutputNative("-t", "nat", "-I", "POSTROUTING", "-d", resolverIP, "-j", postroutingchain)
|
||||
}
|
||||
|
||||
for _, rule := range rules {
|
||||
if iptable.RawCombinedOutputNative(rule...) != nil {
|
||||
logrus.Errorf("set up rule failed, %v", rule)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@ const (
|
|||
func (sb *sandbox) startResolver(restore bool) {
|
||||
sb.resolverOnce.Do(func() {
|
||||
var err error
|
||||
sb.resolver = NewResolver(resolverIPSandbox, true, sb.Key(), sb)
|
||||
sb.resolver = NewResolver(resolverIPSandbox, true, sb)
|
||||
defer func() {
|
||||
if err != nil {
|
||||
sb.resolver = nil
|
||||
|
|
|
@ -5,9 +5,7 @@ import (
|
|||
"io"
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
|
@ -15,20 +13,12 @@ import (
|
|||
|
||||
"github.com/docker/docker/libnetwork/iptables"
|
||||
"github.com/docker/docker/libnetwork/ns"
|
||||
"github.com/docker/docker/pkg/reexec"
|
||||
"github.com/gogo/protobuf/proto"
|
||||
"github.com/ishidawataru/sctp"
|
||||
"github.com/moby/ipvs"
|
||||
"github.com/sirupsen/logrus"
|
||||
"github.com/vishvananda/netlink/nl"
|
||||
"github.com/vishvananda/netns"
|
||||
)
|
||||
|
||||
func init() {
|
||||
reexec.Register("fwmarker", fwMarker)
|
||||
reexec.Register("redirector", redirector)
|
||||
}
|
||||
|
||||
// Populate all loadbalancers on the network that the passed endpoint
|
||||
// belongs to, into this sandbox.
|
||||
func (sb *sandbox) populateLoadBalancers(ep *endpoint) {
|
||||
|
@ -41,7 +31,7 @@ func (sb *sandbox) populateLoadBalancers(ep *endpoint) {
|
|||
eIP := ep.Iface().Address()
|
||||
|
||||
if n.ingress {
|
||||
if err := addRedirectRules(sb.Key(), eIP, ep.ingressPorts); err != nil {
|
||||
if err := sb.addRedirectRules(eIP, ep.ingressPorts); err != nil {
|
||||
logrus.Errorf("Failed to add redirect rules for ep %s (%.7s): %v", ep.Name(), ep.ID(), err)
|
||||
}
|
||||
}
|
||||
|
@ -141,7 +131,7 @@ func (n *network) addLBBackend(ip net.IP, lb *loadBalancer) {
|
|||
}
|
||||
|
||||
logrus.Debugf("Creating service for vip %s fwMark %d ingressPorts %#v in sbox %.7s (%.7s)", lb.vip, lb.fwMark, lb.service.ingressPorts, sb.ID(), sb.ContainerID())
|
||||
if err := invokeFWMarker(sb.Key(), lb.vip, lb.fwMark, lb.service.ingressPorts, eIP, false, n.loadBalancerMode); err != nil {
|
||||
if err := sb.configureFWMark(lb.vip, lb.fwMark, lb.service.ingressPorts, eIP, false, n.loadBalancerMode); err != nil {
|
||||
logrus.Errorf("Failed to add firewall mark rule in sbox %.7s (%.7s): %v", sb.ID(), sb.ContainerID(), err)
|
||||
return
|
||||
}
|
||||
|
@ -240,7 +230,7 @@ func (n *network) rmLBBackend(ip net.IP, lb *loadBalancer, rmService bool, fullR
|
|||
}
|
||||
}
|
||||
|
||||
if err := invokeFWMarker(sb.Key(), lb.vip, lb.fwMark, lb.service.ingressPorts, eIP, true, n.loadBalancerMode); err != nil {
|
||||
if err := sb.configureFWMark(lb.vip, lb.fwMark, lb.service.ingressPorts, eIP, true, n.loadBalancerMode); err != nil {
|
||||
logrus.Errorf("Failed to delete firewall mark rule in sbox %.7s (%.7s): %v", sb.ID(), sb.ContainerID(), err)
|
||||
}
|
||||
|
||||
|
@ -381,7 +371,7 @@ func programIngress(gwIP net.IP, ingressPorts []*PortConfig, isDelete bool) erro
|
|||
}
|
||||
|
||||
path := filepath.Join("/proc/sys/net/ipv4/conf", oifName, "route_localnet")
|
||||
if err := os.WriteFile(path, []byte{'1', '\n'}, 0644); err != nil { //nolint:gosec // gosec complains about perms here, which must be 0644 in this case
|
||||
if err := os.WriteFile(path, []byte{'1', '\n'}, 0o644); err != nil { //nolint:gosec // gosec complains about perms here, which must be 0644 in this case
|
||||
return fmt.Errorf("could not write to %s: %v", path, err)
|
||||
}
|
||||
|
||||
|
@ -540,216 +530,65 @@ func plumbProxy(iPort *PortConfig, isDelete bool) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func writePortsToFile(ports []*PortConfig) (string, error) {
|
||||
f, err := os.CreateTemp("", "port_configs")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close() //nolint:gosec
|
||||
|
||||
buf, _ := proto.Marshal(&EndpointRecord{
|
||||
IngressPorts: ports,
|
||||
})
|
||||
|
||||
n, err := f.Write(buf)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if n < len(buf) {
|
||||
return "", io.ErrShortWrite
|
||||
}
|
||||
|
||||
return f.Name(), nil
|
||||
}
|
||||
|
||||
func readPortsFromFile(fileName string) ([]*PortConfig, error) {
|
||||
buf, err := os.ReadFile(fileName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var epRec EndpointRecord
|
||||
err = proto.Unmarshal(buf, &epRec)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return epRec.IngressPorts, nil
|
||||
}
|
||||
|
||||
// Invoke fwmarker reexec routine to mark vip destined packets with
|
||||
// the passed firewall mark.
|
||||
func invokeFWMarker(path string, vip net.IP, fwMark uint32, ingressPorts []*PortConfig, eIP *net.IPNet, isDelete bool, lbMode string) error {
|
||||
var ingressPortsFile string
|
||||
|
||||
if len(ingressPorts) != 0 {
|
||||
var err error
|
||||
ingressPortsFile, err = writePortsToFile(ingressPorts)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer os.Remove(ingressPortsFile)
|
||||
}
|
||||
// configureFWMark configures the sandbox firewall to mark vip destined packets
|
||||
// with the firewall mark fwMark.
|
||||
func (sb *sandbox) configureFWMark(vip net.IP, fwMark uint32, ingressPorts []*PortConfig, eIP *net.IPNet, isDelete bool, lbMode string) error {
|
||||
// TODO IPv6 support
|
||||
iptable := iptables.GetIptable(iptables.IPv4)
|
||||
|
||||
fwMarkStr := strconv.FormatUint(uint64(fwMark), 10)
|
||||
addDelOpt := "-A"
|
||||
if isDelete {
|
||||
addDelOpt = "-D"
|
||||
}
|
||||
|
||||
cmd := &exec.Cmd{
|
||||
Path: reexec.Self(),
|
||||
Args: append([]string{"fwmarker"}, path, vip.String(), strconv.FormatUint(uint64(fwMark), 10), addDelOpt, ingressPortsFile, eIP.String(), lbMode),
|
||||
Stdout: os.Stdout,
|
||||
Stderr: os.Stderr,
|
||||
}
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
return fmt.Errorf("reexec failed: %v", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Firewall marker reexec function.
|
||||
func fwMarker() {
|
||||
// TODO IPv6 support
|
||||
iptable := iptables.GetIptable(iptables.IPv4)
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
if len(os.Args) < 8 {
|
||||
logrus.Error("invalid number of arguments..")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
var ingressPorts []*PortConfig
|
||||
if os.Args[5] != "" {
|
||||
var err error
|
||||
ingressPorts, err = readPortsFromFile(os.Args[5])
|
||||
if err != nil {
|
||||
logrus.Errorf("Failed reading ingress ports file: %v", err)
|
||||
os.Exit(2)
|
||||
}
|
||||
}
|
||||
|
||||
vip := os.Args[2]
|
||||
fwMark := os.Args[3]
|
||||
if _, err := strconv.ParseUint(fwMark, 10, 32); err != nil {
|
||||
logrus.Errorf("bad fwmark value(%s) passed: %v", fwMark, err)
|
||||
os.Exit(3)
|
||||
}
|
||||
addDelOpt := os.Args[4]
|
||||
|
||||
rules := make([][]string, 0, len(ingressPorts))
|
||||
for _, iPort := range ingressPorts {
|
||||
var (
|
||||
protocol = strings.ToLower(PortConfig_Protocol_name[int32(iPort.Protocol)])
|
||||
publishedPort = strconv.FormatUint(uint64(iPort.PublishedPort), 10)
|
||||
)
|
||||
rule := []string{"-t", "mangle", addDelOpt, "PREROUTING", "-p", protocol, "--dport", publishedPort, "-j", "MARK", "--set-mark", fwMark}
|
||||
rule := []string{"-t", "mangle", addDelOpt, "PREROUTING", "-p", protocol, "--dport", publishedPort, "-j", "MARK", "--set-mark", fwMarkStr}
|
||||
rules = append(rules, rule)
|
||||
}
|
||||
|
||||
ns, err := netns.GetFromPath(os.Args[1])
|
||||
if err != nil {
|
||||
logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err)
|
||||
os.Exit(4)
|
||||
}
|
||||
defer ns.Close()
|
||||
var innerErr error
|
||||
err := sb.ExecFunc(func() {
|
||||
if !isDelete && lbMode == loadBalancerModeNAT {
|
||||
subnet := net.IPNet{IP: eIP.IP.Mask(eIP.Mask), Mask: eIP.Mask}
|
||||
ruleParams := []string{"-m", "ipvs", "--ipvs", "-d", subnet.String(), "-j", "SNAT", "--to-source", eIP.IP.String()}
|
||||
if !iptable.Exists("nat", "POSTROUTING", ruleParams...) {
|
||||
rule := append([]string{"-t", "nat", "-A", "POSTROUTING"}, ruleParams...)
|
||||
rules = append(rules, rule)
|
||||
|
||||
if err := netns.Set(ns); err != nil {
|
||||
logrus.Errorf("setting into container net ns %v failed, %v", os.Args[1], err)
|
||||
os.Exit(5)
|
||||
}
|
||||
|
||||
lbMode := os.Args[7]
|
||||
if addDelOpt == "-A" && lbMode == loadBalancerModeNAT {
|
||||
eIP, subnet, err := net.ParseCIDR(os.Args[6])
|
||||
if err != nil {
|
||||
logrus.Errorf("Failed to parse endpoint IP %s: %v", os.Args[6], err)
|
||||
os.Exit(6)
|
||||
}
|
||||
|
||||
ruleParams := []string{"-m", "ipvs", "--ipvs", "-d", subnet.String(), "-j", "SNAT", "--to-source", eIP.String()}
|
||||
if !iptable.Exists("nat", "POSTROUTING", ruleParams...) {
|
||||
rule := append([]string{"-t", "nat", "-A", "POSTROUTING"}, ruleParams...)
|
||||
rules = append(rules, rule)
|
||||
|
||||
err := os.WriteFile("/proc/sys/net/ipv4/vs/conntrack", []byte{'1', '\n'}, 0644)
|
||||
if err != nil {
|
||||
logrus.Errorf("Failed to write to /proc/sys/net/ipv4/vs/conntrack: %v", err)
|
||||
os.Exit(7)
|
||||
err := os.WriteFile("/proc/sys/net/ipv4/vs/conntrack", []byte{'1', '\n'}, 0644)
|
||||
if err != nil {
|
||||
innerErr = err
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
rule := []string{"-t", "mangle", addDelOpt, "INPUT", "-d", vip + "/32", "-j", "MARK", "--set-mark", fwMark}
|
||||
rules = append(rules, rule)
|
||||
rule := []string{"-t", "mangle", addDelOpt, "INPUT", "-d", vip.String() + "/32", "-j", "MARK", "--set-mark", fwMarkStr}
|
||||
rules = append(rules, rule)
|
||||
|
||||
for _, rule := range rules {
|
||||
if err := iptable.RawCombinedOutputNative(rule...); err != nil {
|
||||
logrus.Errorf("set up rule failed, %v: %v", rule, err)
|
||||
os.Exit(8)
|
||||
for _, rule := range rules {
|
||||
if err := iptable.RawCombinedOutputNative(rule...); err != nil {
|
||||
innerErr = fmt.Errorf("set up rule failed, %v: %w", rule, err)
|
||||
return
|
||||
}
|
||||
}
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return innerErr
|
||||
}
|
||||
|
||||
func addRedirectRules(path string, eIP *net.IPNet, ingressPorts []*PortConfig) error {
|
||||
var ingressPortsFile string
|
||||
|
||||
if len(ingressPorts) != 0 {
|
||||
var err error
|
||||
ingressPortsFile, err = writePortsToFile(ingressPorts)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer os.Remove(ingressPortsFile)
|
||||
}
|
||||
|
||||
cmd := &exec.Cmd{
|
||||
Path: reexec.Self(),
|
||||
Args: append([]string{"redirector"}, path, eIP.String(), ingressPortsFile),
|
||||
Stdout: os.Stdout,
|
||||
Stderr: os.Stderr,
|
||||
}
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
return fmt.Errorf("reexec failed: %v", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Redirector reexec function.
|
||||
func redirector() {
|
||||
func (sb *sandbox) addRedirectRules(eIP *net.IPNet, ingressPorts []*PortConfig) error {
|
||||
// TODO IPv6 support
|
||||
iptable := iptables.GetIptable(iptables.IPv4)
|
||||
runtime.LockOSThread()
|
||||
defer runtime.UnlockOSThread()
|
||||
|
||||
if len(os.Args) < 4 {
|
||||
logrus.Error("invalid number of arguments..")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
var ingressPorts []*PortConfig
|
||||
if os.Args[3] != "" {
|
||||
var err error
|
||||
ingressPorts, err = readPortsFromFile(os.Args[3])
|
||||
if err != nil {
|
||||
logrus.Errorf("Failed reading ingress ports file: %v", err)
|
||||
os.Exit(2)
|
||||
}
|
||||
}
|
||||
|
||||
eIP, _, err := net.ParseCIDR(os.Args[2])
|
||||
if err != nil {
|
||||
logrus.Errorf("Failed to parse endpoint IP %s: %v", os.Args[2], err)
|
||||
os.Exit(3)
|
||||
}
|
||||
ipAddr := eIP.String()
|
||||
ipAddr := eIP.IP.String()
|
||||
|
||||
rules := make([][]string, 0, len(ingressPorts)*3) // 3 rules per port
|
||||
for _, iPort := range ingressPorts {
|
||||
|
@ -770,47 +609,42 @@ func redirector() {
|
|||
)
|
||||
}
|
||||
|
||||
ns, err := netns.GetFromPath(os.Args[1])
|
||||
var innerErr error
|
||||
err := sb.ExecFunc(func() {
|
||||
for _, rule := range rules {
|
||||
if err := iptable.RawCombinedOutputNative(rule...); err != nil {
|
||||
innerErr = fmt.Errorf("set up rule failed, %v: %w", rule, err)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if len(ingressPorts) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Ensure blocking rules for anything else in/to ingress network
|
||||
for _, rule := range [][]string{
|
||||
{"-d", ipAddr, "-p", "sctp", "-j", "DROP"},
|
||||
{"-d", ipAddr, "-p", "udp", "-j", "DROP"},
|
||||
{"-d", ipAddr, "-p", "tcp", "-j", "DROP"},
|
||||
} {
|
||||
if !iptable.ExistsNative(iptables.Filter, "INPUT", rule...) {
|
||||
if err := iptable.RawCombinedOutputNative(append([]string{"-A", "INPUT"}, rule...)...); err != nil {
|
||||
innerErr = fmt.Errorf("set up rule failed, %v: %w", rule, err)
|
||||
return
|
||||
}
|
||||
}
|
||||
rule[0] = "-s"
|
||||
if !iptable.ExistsNative(iptables.Filter, "OUTPUT", rule...) {
|
||||
if err := iptable.RawCombinedOutputNative(append([]string{"-A", "OUTPUT"}, rule...)...); err != nil {
|
||||
innerErr = fmt.Errorf("set up rule failed, %v: %w", rule, err)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
if err != nil {
|
||||
logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err)
|
||||
os.Exit(4)
|
||||
}
|
||||
defer ns.Close()
|
||||
|
||||
if err := netns.Set(ns); err != nil {
|
||||
logrus.Errorf("setting into container net ns %v failed, %v", os.Args[1], err)
|
||||
os.Exit(5)
|
||||
}
|
||||
|
||||
for _, rule := range rules {
|
||||
if err := iptable.RawCombinedOutputNative(rule...); err != nil {
|
||||
logrus.Errorf("set up rule failed, %v: %v", rule, err)
|
||||
os.Exit(6)
|
||||
}
|
||||
}
|
||||
|
||||
if len(ingressPorts) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Ensure blocking rules for anything else in/to ingress network
|
||||
for _, rule := range [][]string{
|
||||
{"-d", ipAddr, "-p", "sctp", "-j", "DROP"},
|
||||
{"-d", ipAddr, "-p", "udp", "-j", "DROP"},
|
||||
{"-d", ipAddr, "-p", "tcp", "-j", "DROP"},
|
||||
} {
|
||||
if !iptable.ExistsNative(iptables.Filter, "INPUT", rule...) {
|
||||
if err := iptable.RawCombinedOutputNative(append([]string{"-A", "INPUT"}, rule...)...); err != nil {
|
||||
logrus.Errorf("set up rule failed, %v: %v", rule, err)
|
||||
os.Exit(7)
|
||||
}
|
||||
}
|
||||
rule[0] = "-s"
|
||||
if !iptable.ExistsNative(iptables.Filter, "OUTPUT", rule...) {
|
||||
if err := iptable.RawCombinedOutputNative(append([]string{"-A", "OUTPUT"}, rule...)...); err != nil {
|
||||
logrus.Errorf("set up rule failed, %v: %v", rule, err)
|
||||
os.Exit(8)
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
return innerErr
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue