moby/libnetwork/osl/namespace_linux.go
Sebastiaan van Stijn cff4f20c44
migrate to github.com/containerd/log v0.1.0
The github.com/containerd/containerd/log package was moved to a separate
module, which will also be used by upcoming (patch) releases of containerd.

This patch moves our own uses of the package to use the new module.

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2023-10-11 17:52:23 +02:00

670 lines
18 KiB
Go

package osl
import (
"context"
"errors"
"fmt"
"net"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/containerd/log"
"github.com/docker/docker/internal/unshare"
"github.com/docker/docker/libnetwork/ns"
"github.com/docker/docker/libnetwork/osl/kernel"
"github.com/docker/docker/libnetwork/types"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netns"
"golang.org/x/sys/unix"
)
const defaultPrefix = "/var/run/docker"
func init() {
// Lock main() to the initial thread to exclude the goroutines spawned
// by func (*Namespace) InvokeFunc() or func setIPv6() below from
// being scheduled onto that thread. Changes to the network namespace of
// the initial thread alter /proc/self/ns/net, which would break any
// code which (incorrectly) assumes that that file is the network
// namespace for the thread it is currently executing on.
runtime.LockOSThread()
}
var (
once sync.Once
garbagePathMap = make(map[string]bool)
gpmLock sync.Mutex
gpmWg sync.WaitGroup
gpmCleanupPeriod = 60 * time.Second
gpmChan = make(chan chan struct{})
netnsBasePath = filepath.Join(defaultPrefix, "netns")
)
// SetBasePath sets the base url prefix for the ns path
func SetBasePath(path string) {
netnsBasePath = filepath.Join(path, "netns")
}
func basePath() string {
return netnsBasePath
}
func createBasePath() {
err := os.MkdirAll(basePath(), 0o755)
if err != nil {
panic("Could not create net namespace path directory")
}
// Start the garbage collection go routine
go removeUnusedPaths()
}
func removeUnusedPaths() {
gpmLock.Lock()
period := gpmCleanupPeriod
gpmLock.Unlock()
ticker := time.NewTicker(period)
for {
var (
gc chan struct{}
gcOk bool
)
select {
case <-ticker.C:
case gc, gcOk = <-gpmChan:
}
gpmLock.Lock()
pathList := make([]string, 0, len(garbagePathMap))
for path := range garbagePathMap {
pathList = append(pathList, path)
}
garbagePathMap = make(map[string]bool)
gpmWg.Add(1)
gpmLock.Unlock()
for _, path := range pathList {
os.Remove(path)
}
gpmWg.Done()
if gcOk {
close(gc)
}
}
}
func addToGarbagePaths(path string) {
gpmLock.Lock()
garbagePathMap[path] = true
gpmLock.Unlock()
}
func removeFromGarbagePaths(path string) {
gpmLock.Lock()
delete(garbagePathMap, path)
gpmLock.Unlock()
}
// GC triggers garbage collection of namespace path right away
// and waits for it.
func GC() {
gpmLock.Lock()
if len(garbagePathMap) == 0 {
// No need for GC if map is empty
gpmLock.Unlock()
return
}
gpmLock.Unlock()
// if content exists in the garbage paths
// we can trigger GC to run, providing a
// channel to be notified on completion
waitGC := make(chan struct{})
gpmChan <- waitGC
// wait for GC completion
<-waitGC
}
// GenerateKey generates a sandbox key based on the passed
// container id.
func GenerateKey(containerID string) string {
maxLen := 12
// Read sandbox key from host for overlay
if strings.HasPrefix(containerID, "-") {
var (
index int
indexStr string
tmpkey string
)
dir, err := os.ReadDir(basePath())
if err != nil {
return ""
}
for _, v := range dir {
id := v.Name()
if strings.HasSuffix(id, containerID[:maxLen-1]) {
indexStr = strings.TrimSuffix(id, containerID[:maxLen-1])
tmpindex, err := strconv.Atoi(indexStr)
if err != nil {
return ""
}
if tmpindex > index {
index = tmpindex
tmpkey = id
}
}
}
containerID = tmpkey
if containerID == "" {
return ""
}
}
if len(containerID) < maxLen {
maxLen = len(containerID)
}
return basePath() + "/" + containerID[:maxLen]
}
// NewSandbox provides a new Namespace instance created in an os specific way
// provided a key which uniquely identifies the sandbox.
func NewSandbox(key string, osCreate, isRestore bool) (*Namespace, error) {
if !isRestore {
err := createNetworkNamespace(key, osCreate)
if err != nil {
return nil, err
}
} else {
once.Do(createBasePath)
}
n := &Namespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)}
sboxNs, err := netns.GetFromPath(n.path)
if err != nil {
return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
}
defer sboxNs.Close()
n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
if err != nil {
return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
}
err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
if err != nil {
log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
}
// In live-restore mode, IPV6 entries are getting cleaned up due to below code
// We should retain IPV6 configurations in live-restore mode when Docker Daemon
// comes back. It should work as it is on other cases
// As starting point, disable IPv6 on all interfaces
if !isRestore && !n.isDefault {
err = setIPv6(n.path, "all", false)
if err != nil {
log.G(context.TODO()).Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
}
}
if err = n.loopbackUp(); err != nil {
n.nlHandle.Close()
return nil, err
}
return n, nil
}
func mountNetworkNamespace(basePath string, lnPath string) error {
return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "")
}
// GetSandboxForExternalKey returns sandbox object for the supplied path
func GetSandboxForExternalKey(basePath string, key string) (*Namespace, error) {
if err := createNamespaceFile(key); err != nil {
return nil, err
}
if err := mountNetworkNamespace(basePath, key); err != nil {
return nil, err
}
n := &Namespace{path: key, nextIfIndex: make(map[string]int)}
sboxNs, err := netns.GetFromPath(n.path)
if err != nil {
return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
}
defer sboxNs.Close()
n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
if err != nil {
return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
}
err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
if err != nil {
log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
}
// As starting point, disable IPv6 on all interfaces
err = setIPv6(n.path, "all", false)
if err != nil {
log.G(context.TODO()).Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
}
if err = n.loopbackUp(); err != nil {
n.nlHandle.Close()
return nil, err
}
return n, nil
}
func createNetworkNamespace(path string, osCreate bool) error {
if err := createNamespaceFile(path); err != nil {
return err
}
do := func() error {
return mountNetworkNamespace(fmt.Sprintf("/proc/self/task/%d/ns/net", unix.Gettid()), path)
}
if osCreate {
return unshare.Go(unix.CLONE_NEWNET, do, nil)
}
return do()
}
func unmountNamespaceFile(path string) {
if _, err := os.Stat(path); err != nil {
// ignore when we cannot stat the path
return
}
if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) {
log.G(context.TODO()).WithError(err).Error("Error unmounting namespace file")
}
}
func createNamespaceFile(path string) error {
once.Do(createBasePath)
// Remove it from garbage collection list if present
removeFromGarbagePaths(path)
// If the path is there unmount it first
unmountNamespaceFile(path)
// wait for garbage collection to complete if it is in progress
// before trying to create the file.
//
// TODO(aker): This garbage-collection was for a kernel bug in kernels 3.18-4.0.1: is this still needed on current kernels (and on kernel 3.10)? see https://github.com/moby/moby/pull/46315/commits/c0a6beba8e61d4019e1806d5241ba22007072ca2#r1331327103
gpmWg.Wait()
f, err := os.Create(path)
if err != nil {
return err
}
_ = f.Close()
return nil
}
// Namespace represents a network sandbox. It represents a Linux network
// namespace, and moves an interface into it when called on method AddInterface
// or sets the gateway etc. It holds a list of Interfaces, routes etc., and more
// can be added dynamically.
type Namespace struct {
path string
iFaces []*Interface
gw net.IP
gwv6 net.IP
staticRoutes []*types.StaticRoute
neighbors []*neigh
nextIfIndex map[string]int
isDefault bool
nlHandle *netlink.Handle
loV6Enabled bool
mu sync.Mutex
}
// Interfaces returns the collection of Interface previously added with the AddInterface
// method. Note that this doesn't include network interfaces added in any
// other way (such as the default loopback interface which is automatically
// created on creation of a sandbox).
func (n *Namespace) Interfaces() []*Interface {
ifaces := make([]*Interface, len(n.iFaces))
copy(ifaces, n.iFaces)
return ifaces
}
func (n *Namespace) loopbackUp() error {
iface, err := n.nlHandle.LinkByName("lo")
if err != nil {
return err
}
return n.nlHandle.LinkSetUp(iface)
}
// GetLoopbackIfaceName returns the name of the loopback interface
func (n *Namespace) GetLoopbackIfaceName() string {
return "lo"
}
// AddAliasIP adds the passed IP address to the named interface
func (n *Namespace) AddAliasIP(ifName string, ip *net.IPNet) error {
iface, err := n.nlHandle.LinkByName(ifName)
if err != nil {
return err
}
return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip})
}
// RemoveAliasIP removes the passed IP address from the named interface
func (n *Namespace) RemoveAliasIP(ifName string, ip *net.IPNet) error {
iface, err := n.nlHandle.LinkByName(ifName)
if err != nil {
return err
}
return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip})
}
// DisableARPForVIP disables ARP replies and requests for VIP addresses
// on a particular interface.
func (n *Namespace) DisableARPForVIP(srcName string) (Err error) {
dstName := ""
for _, i := range n.Interfaces() {
if i.SrcName() == srcName {
dstName = i.DstName()
break
}
}
if dstName == "" {
return fmt.Errorf("failed to find interface %s in sandbox", srcName)
}
err := n.InvokeFunc(func() {
path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore")
if err := os.WriteFile(path, []byte{'1', '\n'}, 0o644); err != nil {
Err = fmt.Errorf("Failed to set %s to 1: %v", path, err)
return
}
path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce")
if err := os.WriteFile(path, []byte{'2', '\n'}, 0o644); err != nil {
Err = fmt.Errorf("Failed to set %s to 2: %v", path, err)
return
}
})
if err != nil {
return err
}
return
}
// InvokeFunc invoke a function in the network namespace.
func (n *Namespace) InvokeFunc(f func()) error {
path := n.nsPath()
newNS, err := netns.GetFromPath(path)
if err != nil {
return fmt.Errorf("failed get network namespace %q: %w", path, err)
}
defer newNS.Close()
done := make(chan error, 1)
go func() {
runtime.LockOSThread()
// InvokeFunc() could have been called from a goroutine with
// tampered thread state, e.g. from another InvokeFunc()
// callback. The outer goroutine's thread state cannot be
// trusted.
origNS, err := netns.Get()
if err != nil {
runtime.UnlockOSThread()
done <- fmt.Errorf("failed to get original network namespace: %w", err)
return
}
defer origNS.Close()
if err := netns.Set(newNS); err != nil {
runtime.UnlockOSThread()
done <- err
return
}
defer func() {
close(done)
if err := netns.Set(origNS); err != nil {
log.G(context.TODO()).WithError(err).Warn("failed to restore thread's network namespace")
// Recover from the error by leaving this goroutine locked to
// the thread. The runtime will terminate the thread and replace
// it with a clean one when this goroutine returns.
} else {
runtime.UnlockOSThread()
}
}()
f()
}()
return <-done
}
func (n *Namespace) nsPath() string {
n.mu.Lock()
defer n.mu.Unlock()
return n.path
}
// Key returns the path where the network namespace is mounted.
func (n *Namespace) Key() string {
return n.path
}
// Destroy destroys the sandbox.
func (n *Namespace) Destroy() error {
if n.nlHandle != nil {
n.nlHandle.Close()
}
// Assuming no running process is executing in this network namespace,
// unmounting is sufficient to destroy it.
if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil {
return err
}
// Stash it into the garbage collection list
addToGarbagePaths(n.path)
return nil
}
// Restore restores the network namespace.
func (n *Namespace) Restore(interfaces map[Iface][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error {
// restore interfaces
for iface, opts := range interfaces {
i, err := newInterface(n, iface.SrcName, iface.DstPrefix, opts...)
if err != nil {
return err
}
if n.isDefault {
i.dstName = i.srcName
} else {
links, err := n.nlHandle.LinkList()
if err != nil {
return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path)
}
// due to the docker network connect/disconnect, so the dstName should
// restore from the namespace
for _, link := range links {
ifaceName := link.Attrs().Name
if i.dstName == "vxlan" && strings.HasPrefix(ifaceName, "vxlan") {
i.dstName = ifaceName
break
}
// find the interface name by ip
if i.address != nil {
addresses, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4)
if err != nil {
return err
}
for _, addr := range addresses {
if addr.IPNet.String() == i.address.String() {
i.dstName = ifaceName
break
}
}
if i.dstName == ifaceName {
break
}
}
// This is to find the interface name of the pair in overlay sandbox
if i.master != "" && i.dstName == "veth" && strings.HasPrefix(ifaceName, "veth") {
i.dstName = ifaceName
}
}
var index int
if idx := strings.TrimPrefix(i.dstName, iface.DstPrefix); idx != "" {
index, err = strconv.Atoi(idx)
if err != nil {
return fmt.Errorf("failed to restore interface in network namespace %q: invalid dstName for interface: %s: %v", n.path, i.dstName, err)
}
}
index++
n.mu.Lock()
if index > n.nextIfIndex[iface.DstPrefix] {
n.nextIfIndex[iface.DstPrefix] = index
}
n.iFaces = append(n.iFaces, i)
n.mu.Unlock()
}
}
// restore routes and gateways
n.mu.Lock()
n.staticRoutes = append(n.staticRoutes, routes...)
if len(gw) > 0 {
n.gw = gw
}
if len(gw6) > 0 {
n.gwv6 = gw6
}
n.mu.Unlock()
return nil
}
// Checks whether IPv6 needs to be enabled/disabled on the loopback interface
func (n *Namespace) checkLoV6() {
var (
enable = false
action = "disable"
)
n.mu.Lock()
for _, iface := range n.iFaces {
if iface.AddressIPv6() != nil {
enable = true
action = "enable"
break
}
}
n.mu.Unlock()
if n.loV6Enabled == enable {
return
}
if err := setIPv6(n.path, "lo", enable); err != nil {
log.G(context.TODO()).Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err)
}
n.loV6Enabled = enable
}
// ApplyOSTweaks applies operating system specific knobs on the sandbox.
func (n *Namespace) ApplyOSTweaks(types []SandboxType) {
for _, t := range types {
switch t {
case SandboxTypeLoadBalancer, SandboxTypeIngress:
kernel.ApplyOSTweaks(map[string]*kernel.OSValue{
// disables any special handling on port reuse of existing IPVS connection table entries
// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32
"net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
// expires connection from the IPVS connection table when the backend is not available
// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133
"net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
// expires persistent connections to destination servers with weights set to 0
// more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151
"net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
})
}
}
}
func setIPv6(nspath, iface string, enable bool) error {
errCh := make(chan error, 1)
go func() {
defer close(errCh)
namespace, err := netns.GetFromPath(nspath)
if err != nil {
errCh <- fmt.Errorf("failed get network namespace %q: %w", nspath, err)
return
}
defer namespace.Close()
runtime.LockOSThread()
origNS, err := netns.Get()
if err != nil {
runtime.UnlockOSThread()
errCh <- fmt.Errorf("failed to get current network namespace: %w", err)
return
}
defer origNS.Close()
if err = netns.Set(namespace); err != nil {
runtime.UnlockOSThread()
errCh <- fmt.Errorf("setting into container netns %q failed: %w", nspath, err)
return
}
defer func() {
if err := netns.Set(origNS); err != nil {
log.G(context.TODO()).WithError(err).Error("libnetwork: restoring thread network namespace failed")
// The error is only fatal for the current thread. Keep this
// goroutine locked to the thread to make the runtime replace it
// with a clean thread once this goroutine returns.
} else {
runtime.UnlockOSThread()
}
}()
var (
action = "disable"
value = byte('1')
path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", iface)
)
if enable {
action = "enable"
value = '0'
}
if _, err := os.Stat(path); err != nil {
if os.IsNotExist(err) {
log.G(context.TODO()).WithError(err).Warn("Cannot configure IPv6 forwarding on container interface. Has IPv6 been disabled in this node's kernel?")
return
}
errCh <- err
return
}
if err = os.WriteFile(path, []byte{value, '\n'}, 0o644); err != nil {
errCh <- fmt.Errorf("failed to %s IPv6 forwarding for container's interface %s: %w", action, iface, err)
return
}
}()
return <-errCh
}