123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674 |
- package osl
- import (
- "context"
- "errors"
- "fmt"
- "net"
- "os"
- "path/filepath"
- "runtime"
- "strconv"
- "strings"
- "sync"
- "syscall"
- "time"
- "github.com/containerd/log"
- "github.com/docker/docker/internal/unshare"
- "github.com/docker/docker/libnetwork/ns"
- "github.com/docker/docker/libnetwork/osl/kernel"
- "github.com/docker/docker/libnetwork/types"
- "github.com/vishvananda/netlink"
- "github.com/vishvananda/netns"
- "golang.org/x/sys/unix"
- )
// defaultPrefix is the directory whose "netns" sub-directory is used as the
// initial network namespace base path.
const (
	defaultPrefix = "/var/run/docker"
)
// init pins main() to the initial OS thread.
//
// Goroutines started by (*Namespace).InvokeFunc() and setIPv6() switch the
// network namespace of the thread they run on. Keeping main() locked to the
// initial thread prevents those goroutines from being scheduled onto it:
// changing the initial thread's namespace alters /proc/self/ns/net, which
// would break code that (incorrectly) treats that file as the namespace of
// whichever thread it is currently executing on.
func init() {
	runtime.LockOSThread()
}
- var (
- once sync.Once
- garbagePathMap = make(map[string]bool)
- gpmLock sync.Mutex
- gpmWg sync.WaitGroup
- gpmCleanupPeriod = 60 * time.Second
- gpmChan = make(chan chan struct{})
- netnsBasePath = filepath.Join(defaultPrefix, "netns")
- )
- // SetBasePath sets the base url prefix for the ns path
- func SetBasePath(path string) {
- netnsBasePath = filepath.Join(path, "netns")
- }
- func basePath() string {
- return netnsBasePath
- }
- func createBasePath() {
- err := os.MkdirAll(basePath(), 0o755)
- if err != nil {
- panic("Could not create net namespace path directory")
- }
- // Start the garbage collection go routine
- go removeUnusedPaths()
- }
- func removeUnusedPaths() {
- gpmLock.Lock()
- period := gpmCleanupPeriod
- gpmLock.Unlock()
- ticker := time.NewTicker(period)
- for {
- var (
- gc chan struct{}
- gcOk bool
- )
- select {
- case <-ticker.C:
- case gc, gcOk = <-gpmChan:
- }
- gpmLock.Lock()
- pathList := make([]string, 0, len(garbagePathMap))
- for path := range garbagePathMap {
- pathList = append(pathList, path)
- }
- garbagePathMap = make(map[string]bool)
- gpmWg.Add(1)
- gpmLock.Unlock()
- for _, path := range pathList {
- os.Remove(path)
- }
- gpmWg.Done()
- if gcOk {
- close(gc)
- }
- }
- }
- func addToGarbagePaths(path string) {
- gpmLock.Lock()
- garbagePathMap[path] = true
- gpmLock.Unlock()
- }
- func removeFromGarbagePaths(path string) {
- gpmLock.Lock()
- delete(garbagePathMap, path)
- gpmLock.Unlock()
- }
- // GC triggers garbage collection of namespace path right away
- // and waits for it.
- func GC() {
- gpmLock.Lock()
- if len(garbagePathMap) == 0 {
- // No need for GC if map is empty
- gpmLock.Unlock()
- return
- }
- gpmLock.Unlock()
- // if content exists in the garbage paths
- // we can trigger GC to run, providing a
- // channel to be notified on completion
- waitGC := make(chan struct{})
- gpmChan <- waitGC
- // wait for GC completion
- <-waitGC
- }
- // GenerateKey generates a sandbox key based on the passed
- // container id.
- func GenerateKey(containerID string) string {
- maxLen := 12
- // Read sandbox key from host for overlay
- if strings.HasPrefix(containerID, "-") {
- var (
- index int
- indexStr string
- tmpkey string
- )
- dir, err := os.ReadDir(basePath())
- if err != nil {
- return ""
- }
- for _, v := range dir {
- id := v.Name()
- if strings.HasSuffix(id, containerID[:maxLen-1]) {
- indexStr = strings.TrimSuffix(id, containerID[:maxLen-1])
- tmpindex, err := strconv.Atoi(indexStr)
- if err != nil {
- return ""
- }
- if tmpindex > index {
- index = tmpindex
- tmpkey = id
- }
- }
- }
- containerID = tmpkey
- if containerID == "" {
- return ""
- }
- }
- if len(containerID) < maxLen {
- maxLen = len(containerID)
- }
- return basePath() + "/" + containerID[:maxLen]
- }
- // NewSandbox provides a new Namespace instance created in an os specific way
- // provided a key which uniquely identifies the sandbox.
- func NewSandbox(key string, osCreate, isRestore bool) (*Namespace, error) {
- if !isRestore {
- err := createNetworkNamespace(key, osCreate)
- if err != nil {
- return nil, err
- }
- } else {
- once.Do(createBasePath)
- }
- n := &Namespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)}
- sboxNs, err := netns.GetFromPath(n.path)
- if err != nil {
- return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
- }
- defer sboxNs.Close()
- n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
- if err != nil {
- return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
- }
- err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
- if err != nil {
- log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
- }
- // In live-restore mode, IPV6 entries are getting cleaned up due to below code
- // We should retain IPV6 configurations in live-restore mode when Docker Daemon
- // comes back. It should work as it is on other cases
- // As starting point, disable IPv6 on all interfaces
- if !isRestore && !n.isDefault {
- err = setIPv6(n.path, "all", false)
- if err != nil {
- log.G(context.TODO()).Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
- }
- }
- if err = n.loopbackUp(); err != nil {
- n.nlHandle.Close()
- return nil, err
- }
- return n, nil
- }
// mountNetworkNamespace bind-mounts basePath onto lnPath. A mount failure is
// returned wrapped with both paths.
func mountNetworkNamespace(basePath string, lnPath string) error {
	mountErr := syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "")
	if mountErr == nil {
		return nil
	}
	return fmt.Errorf("bind-mount %s -> %s: %w", basePath, lnPath, mountErr)
}
- // GetSandboxForExternalKey returns sandbox object for the supplied path
- func GetSandboxForExternalKey(basePath string, key string) (*Namespace, error) {
- if err := createNamespaceFile(key); err != nil {
- return nil, err
- }
- if err := mountNetworkNamespace(basePath, key); err != nil {
- return nil, err
- }
- n := &Namespace{path: key, nextIfIndex: make(map[string]int)}
- sboxNs, err := netns.GetFromPath(n.path)
- if err != nil {
- return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
- }
- defer sboxNs.Close()
- n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
- if err != nil {
- return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
- }
- err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
- if err != nil {
- log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
- }
- // As starting point, disable IPv6 on all interfaces
- err = setIPv6(n.path, "all", false)
- if err != nil {
- log.G(context.TODO()).Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
- }
- if err = n.loopbackUp(); err != nil {
- n.nlHandle.Close()
- return nil, err
- }
- return n, nil
- }
- func createNetworkNamespace(path string, osCreate bool) error {
- if err := createNamespaceFile(path); err != nil {
- return err
- }
- do := func() error {
- return mountNetworkNamespace(fmt.Sprintf("/proc/self/task/%d/ns/net", unix.Gettid()), path)
- }
- if osCreate {
- return unshare.Go(unix.CLONE_NEWNET, do, nil)
- }
- return do()
- }
- func unmountNamespaceFile(path string) {
- if _, err := os.Stat(path); err != nil {
- // ignore when we cannot stat the path
- return
- }
- if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) {
- log.G(context.TODO()).WithError(err).Error("Error unmounting namespace file")
- }
- }
- func createNamespaceFile(path string) error {
- once.Do(createBasePath)
- // Remove it from garbage collection list if present
- removeFromGarbagePaths(path)
- // If the path is there unmount it first
- unmountNamespaceFile(path)
- // wait for garbage collection to complete if it is in progress
- // before trying to create the file.
- //
- // TODO(aker): This garbage-collection was for a kernel bug in kernels 3.18-4.0.1: is this still needed on current kernels (and on kernel 3.10)? see https://github.com/moby/moby/pull/46315/commits/c0a6beba8e61d4019e1806d5241ba22007072ca2#r1331327103
- gpmWg.Wait()
- f, err := os.Create(path)
- if err != nil {
- return err
- }
- _ = f.Close()
- return nil
- }
- // Namespace represents a network sandbox. It represents a Linux network
- // namespace, and moves an interface into it when called on method AddInterface
- // or sets the gateway etc. It holds a list of Interfaces, routes etc., and more
- // can be added dynamically.
- type Namespace struct {
- path string
- iFaces []*Interface
- gw net.IP
- gwv6 net.IP
- staticRoutes []*types.StaticRoute
- neighbors []*neigh
- nextIfIndex map[string]int
- isDefault bool
- nlHandle *netlink.Handle
- loV6Enabled bool
- mu sync.Mutex
- }
- // Interfaces returns the collection of Interface previously added with the AddInterface
- // method. Note that this doesn't include network interfaces added in any
- // other way (such as the default loopback interface which is automatically
- // created on creation of a sandbox).
- func (n *Namespace) Interfaces() []*Interface {
- ifaces := make([]*Interface, len(n.iFaces))
- copy(ifaces, n.iFaces)
- return ifaces
- }
- func (n *Namespace) loopbackUp() error {
- iface, err := n.nlHandle.LinkByName("lo")
- if err != nil {
- return err
- }
- return n.nlHandle.LinkSetUp(iface)
- }
- // GetLoopbackIfaceName returns the name of the loopback interface
- func (n *Namespace) GetLoopbackIfaceName() string {
- return "lo"
- }
- // AddAliasIP adds the passed IP address to the named interface
- func (n *Namespace) AddAliasIP(ifName string, ip *net.IPNet) error {
- iface, err := n.nlHandle.LinkByName(ifName)
- if err != nil {
- return err
- }
- return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip})
- }
- // RemoveAliasIP removes the passed IP address from the named interface
- func (n *Namespace) RemoveAliasIP(ifName string, ip *net.IPNet) error {
- iface, err := n.nlHandle.LinkByName(ifName)
- if err != nil {
- return err
- }
- return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip})
- }
- // DisableARPForVIP disables ARP replies and requests for VIP addresses
- // on a particular interface.
- func (n *Namespace) DisableARPForVIP(srcName string) (Err error) {
- dstName := ""
- for _, i := range n.Interfaces() {
- if i.SrcName() == srcName {
- dstName = i.DstName()
- break
- }
- }
- if dstName == "" {
- return fmt.Errorf("failed to find interface %s in sandbox", srcName)
- }
- err := n.InvokeFunc(func() {
- path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore")
- if err := os.WriteFile(path, []byte{'1', '\n'}, 0o644); err != nil {
- Err = fmt.Errorf("Failed to set %s to 1: %v", path, err)
- return
- }
- path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce")
- if err := os.WriteFile(path, []byte{'2', '\n'}, 0o644); err != nil {
- Err = fmt.Errorf("Failed to set %s to 2: %v", path, err)
- return
- }
- })
- if err != nil {
- return err
- }
- return
- }
- // InvokeFunc invoke a function in the network namespace.
- func (n *Namespace) InvokeFunc(f func()) error {
- path := n.nsPath()
- newNS, err := netns.GetFromPath(path)
- if err != nil {
- return fmt.Errorf("failed get network namespace %q: %w", path, err)
- }
- defer newNS.Close()
- done := make(chan error, 1)
- go func() {
- runtime.LockOSThread()
- // InvokeFunc() could have been called from a goroutine with
- // tampered thread state, e.g. from another InvokeFunc()
- // callback. The outer goroutine's thread state cannot be
- // trusted.
- origNS, err := netns.Get()
- if err != nil {
- runtime.UnlockOSThread()
- done <- fmt.Errorf("failed to get original network namespace: %w", err)
- return
- }
- defer origNS.Close()
- if err := netns.Set(newNS); err != nil {
- runtime.UnlockOSThread()
- done <- err
- return
- }
- defer func() {
- close(done)
- if err := netns.Set(origNS); err != nil {
- log.G(context.TODO()).WithError(err).Warn("failed to restore thread's network namespace")
- // Recover from the error by leaving this goroutine locked to
- // the thread. The runtime will terminate the thread and replace
- // it with a clean one when this goroutine returns.
- } else {
- runtime.UnlockOSThread()
- }
- }()
- f()
- }()
- return <-done
- }
- func (n *Namespace) nsPath() string {
- n.mu.Lock()
- defer n.mu.Unlock()
- return n.path
- }
- // Key returns the path where the network namespace is mounted.
- func (n *Namespace) Key() string {
- return n.path
- }
- // Destroy destroys the sandbox.
- func (n *Namespace) Destroy() error {
- if n.nlHandle != nil {
- n.nlHandle.Close()
- }
- // Assuming no running process is executing in this network namespace,
- // unmounting is sufficient to destroy it.
- if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil {
- return err
- }
- // Stash it into the garbage collection list
- addToGarbagePaths(n.path)
- return nil
- }
- // Restore restores the network namespace.
- func (n *Namespace) Restore(interfaces map[Iface][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error {
- // restore interfaces
- for iface, opts := range interfaces {
- i, err := newInterface(n, iface.SrcName, iface.DstPrefix, opts...)
- if err != nil {
- return err
- }
- if n.isDefault {
- i.dstName = i.srcName
- } else {
- links, err := n.nlHandle.LinkList()
- if err != nil {
- return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path)
- }
- // due to the docker network connect/disconnect, so the dstName should
- // restore from the namespace
- for _, link := range links {
- ifaceName := link.Attrs().Name
- if i.dstName == "vxlan" && strings.HasPrefix(ifaceName, "vxlan") {
- i.dstName = ifaceName
- break
- }
- // find the interface name by ip
- if i.address != nil {
- addresses, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4)
- if err != nil {
- return err
- }
- for _, addr := range addresses {
- if addr.IPNet.String() == i.address.String() {
- i.dstName = ifaceName
- break
- }
- }
- if i.dstName == ifaceName {
- break
- }
- }
- // This is to find the interface name of the pair in overlay sandbox
- if i.master != "" && i.dstName == "veth" && strings.HasPrefix(ifaceName, "veth") {
- i.dstName = ifaceName
- }
- }
- var index int
- if idx := strings.TrimPrefix(i.dstName, iface.DstPrefix); idx != "" {
- index, err = strconv.Atoi(idx)
- if err != nil {
- return fmt.Errorf("failed to restore interface in network namespace %q: invalid dstName for interface: %s: %v", n.path, i.dstName, err)
- }
- }
- index++
- n.mu.Lock()
- if index > n.nextIfIndex[iface.DstPrefix] {
- n.nextIfIndex[iface.DstPrefix] = index
- }
- n.iFaces = append(n.iFaces, i)
- n.mu.Unlock()
- }
- }
- // restore routes and gateways
- n.mu.Lock()
- n.staticRoutes = append(n.staticRoutes, routes...)
- if len(gw) > 0 {
- n.gw = gw
- }
- if len(gw6) > 0 {
- n.gwv6 = gw6
- }
- n.mu.Unlock()
- return nil
- }
- // Checks whether IPv6 needs to be enabled/disabled on the loopback interface
- func (n *Namespace) checkLoV6() {
- var (
- enable = false
- action = "disable"
- )
- n.mu.Lock()
- for _, iface := range n.iFaces {
- if iface.AddressIPv6() != nil {
- enable = true
- action = "enable"
- break
- }
- }
- n.mu.Unlock()
- if n.loV6Enabled == enable {
- return
- }
- if err := setIPv6(n.path, "lo", enable); err != nil {
- log.G(context.TODO()).Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err)
- }
- n.loV6Enabled = enable
- }
- // ApplyOSTweaks applies operating system specific knobs on the sandbox.
- func (n *Namespace) ApplyOSTweaks(types []SandboxType) {
- for _, t := range types {
- switch t {
- case SandboxTypeLoadBalancer, SandboxTypeIngress:
- kernel.ApplyOSTweaks(map[string]*kernel.OSValue{
- // disables any special handling on port reuse of existing IPVS connection table entries
- // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32
- "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
- // expires connection from the IPVS connection table when the backend is not available
- // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133
- "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
- // expires persistent connections to destination servers with weights set to 0
- // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151
- "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
- })
- }
- }
- }
- func setIPv6(nspath, iface string, enable bool) error {
- errCh := make(chan error, 1)
- go func() {
- defer close(errCh)
- namespace, err := netns.GetFromPath(nspath)
- if err != nil {
- errCh <- fmt.Errorf("failed get network namespace %q: %w", nspath, err)
- return
- }
- defer namespace.Close()
- runtime.LockOSThread()
- origNS, err := netns.Get()
- if err != nil {
- runtime.UnlockOSThread()
- errCh <- fmt.Errorf("failed to get current network namespace: %w", err)
- return
- }
- defer origNS.Close()
- if err = netns.Set(namespace); err != nil {
- runtime.UnlockOSThread()
- errCh <- fmt.Errorf("setting into container netns %q failed: %w", nspath, err)
- return
- }
- defer func() {
- if err := netns.Set(origNS); err != nil {
- log.G(context.TODO()).WithError(err).Error("libnetwork: restoring thread network namespace failed")
- // The error is only fatal for the current thread. Keep this
- // goroutine locked to the thread to make the runtime replace it
- // with a clean thread once this goroutine returns.
- } else {
- runtime.UnlockOSThread()
- }
- }()
- var (
- action = "disable"
- value = byte('1')
- path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", iface)
- )
- if enable {
- action = "enable"
- value = '0'
- }
- if _, err := os.Stat(path); err != nil {
- if os.IsNotExist(err) {
- log.G(context.TODO()).WithError(err).Warn("Cannot configure IPv6 forwarding on container interface. Has IPv6 been disabled in this node's kernel?")
- return
- }
- errCh <- err
- return
- }
- if err = os.WriteFile(path, []byte{value, '\n'}, 0o644); err != nil {
- errCh <- fmt.Errorf("failed to %s IPv6 forwarding for container's interface %s: %w", action, iface, err)
- return
- }
- }()
- return <-errCh
- }
|