namespace_linux.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664
  1. package osl
  2. import (
  3. "context"
  4. "errors"
  5. "fmt"
  6. "net"
  7. "os"
  8. "path/filepath"
  9. "runtime"
  10. "strconv"
  11. "strings"
  12. "sync"
  13. "syscall"
  14. "time"
  15. "github.com/containerd/log"
  16. "github.com/docker/docker/internal/unshare"
  17. "github.com/docker/docker/libnetwork/ns"
  18. "github.com/docker/docker/libnetwork/osl/kernel"
  19. "github.com/docker/docker/libnetwork/types"
  20. "github.com/vishvananda/netlink"
  21. "github.com/vishvananda/netlink/nl"
  22. "github.com/vishvananda/netns"
  23. "golang.org/x/sys/unix"
  24. )
  // defaultPrefix is the directory under which the "netns" directory holding
  // namespace files is placed, until SetBasePath selects another location.
  const defaultPrefix = "/var/run/docker"
  // init pins main() to the initial OS thread.
  func init() {
  	// Goroutines started by (*Namespace).InvokeFunc() and setIPv6() switch
  	// the network namespace of the thread they run on. Keeping main()
  	// locked to the initial thread stops those goroutines from ever being
  	// scheduled onto it. If the initial thread changed namespace,
  	// /proc/self/ns/net would change too, breaking any code that
  	// (incorrectly) treats that file as the network namespace of whatever
  	// thread it happens to be running on.
  	runtime.LockOSThread()
  }
  35. var (
  36. once sync.Once
  37. garbagePathMap = make(map[string]bool)
  38. gpmLock sync.Mutex
  39. gpmWg sync.WaitGroup
  40. gpmCleanupPeriod = 60 * time.Second
  41. gpmChan = make(chan chan struct{})
  42. netnsBasePath = filepath.Join(defaultPrefix, "netns")
  43. )
  44. // SetBasePath sets the base url prefix for the ns path
  45. func SetBasePath(path string) {
  46. netnsBasePath = filepath.Join(path, "netns")
  47. }
  48. func basePath() string {
  49. return netnsBasePath
  50. }
  51. func createBasePath() {
  52. err := os.MkdirAll(basePath(), 0o755)
  53. if err != nil {
  54. panic("Could not create net namespace path directory")
  55. }
  56. // Start the garbage collection go routine
  57. go removeUnusedPaths()
  58. }
  59. func removeUnusedPaths() {
  60. gpmLock.Lock()
  61. period := gpmCleanupPeriod
  62. gpmLock.Unlock()
  63. ticker := time.NewTicker(period)
  64. for {
  65. var (
  66. gc chan struct{}
  67. gcOk bool
  68. )
  69. select {
  70. case <-ticker.C:
  71. case gc, gcOk = <-gpmChan:
  72. }
  73. gpmLock.Lock()
  74. pathList := make([]string, 0, len(garbagePathMap))
  75. for path := range garbagePathMap {
  76. pathList = append(pathList, path)
  77. }
  78. garbagePathMap = make(map[string]bool)
  79. gpmWg.Add(1)
  80. gpmLock.Unlock()
  81. for _, path := range pathList {
  82. os.Remove(path)
  83. }
  84. gpmWg.Done()
  85. if gcOk {
  86. close(gc)
  87. }
  88. }
  89. }
  90. func addToGarbagePaths(path string) {
  91. gpmLock.Lock()
  92. garbagePathMap[path] = true
  93. gpmLock.Unlock()
  94. }
  95. func removeFromGarbagePaths(path string) {
  96. gpmLock.Lock()
  97. delete(garbagePathMap, path)
  98. gpmLock.Unlock()
  99. }
  100. // GC triggers garbage collection of namespace path right away
  101. // and waits for it.
  102. func GC() {
  103. gpmLock.Lock()
  104. if len(garbagePathMap) == 0 {
  105. // No need for GC if map is empty
  106. gpmLock.Unlock()
  107. return
  108. }
  109. gpmLock.Unlock()
  110. // if content exists in the garbage paths
  111. // we can trigger GC to run, providing a
  112. // channel to be notified on completion
  113. waitGC := make(chan struct{})
  114. gpmChan <- waitGC
  115. // wait for GC completion
  116. <-waitGC
  117. }
  118. // GenerateKey generates a sandbox key based on the passed
  119. // container id.
  120. func GenerateKey(containerID string) string {
  121. maxLen := 12
  122. // Read sandbox key from host for overlay
  123. if strings.HasPrefix(containerID, "-") {
  124. var (
  125. index int
  126. indexStr string
  127. tmpkey string
  128. )
  129. dir, err := os.ReadDir(basePath())
  130. if err != nil {
  131. return ""
  132. }
  133. for _, v := range dir {
  134. id := v.Name()
  135. if strings.HasSuffix(id, containerID[:maxLen-1]) {
  136. indexStr = strings.TrimSuffix(id, containerID[:maxLen-1])
  137. tmpindex, err := strconv.Atoi(indexStr)
  138. if err != nil {
  139. return ""
  140. }
  141. if tmpindex > index {
  142. index = tmpindex
  143. tmpkey = id
  144. }
  145. }
  146. }
  147. containerID = tmpkey
  148. if containerID == "" {
  149. return ""
  150. }
  151. }
  152. if len(containerID) < maxLen {
  153. maxLen = len(containerID)
  154. }
  155. return basePath() + "/" + containerID[:maxLen]
  156. }
  157. // NewSandbox provides a new Namespace instance created in an os specific way
  158. // provided a key which uniquely identifies the sandbox.
  159. func NewSandbox(key string, osCreate, isRestore bool) (*Namespace, error) {
  160. if !isRestore {
  161. err := createNetworkNamespace(key, osCreate)
  162. if err != nil {
  163. return nil, err
  164. }
  165. } else {
  166. once.Do(createBasePath)
  167. }
  168. n := &Namespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)}
  169. sboxNs, err := netns.GetFromPath(n.path)
  170. if err != nil {
  171. return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
  172. }
  173. defer sboxNs.Close()
  174. n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
  175. if err != nil {
  176. return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
  177. }
  178. err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
  179. if err != nil {
  180. log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
  181. }
  182. if err = n.loopbackUp(); err != nil {
  183. n.nlHandle.Close()
  184. return nil, err
  185. }
  186. return n, nil
  187. }
  // mountNetworkNamespace bind-mounts basePath onto lnPath. A failure is
  // returned wrapped with both paths for context.
  func mountNetworkNamespace(basePath string, lnPath string) error {
  	if err := syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, ""); err != nil {
  		return fmt.Errorf("bind-mount %s -> %s: %w", basePath, lnPath, err)
  	}
  	return nil
  }
  195. // GetSandboxForExternalKey returns sandbox object for the supplied path
  196. func GetSandboxForExternalKey(basePath string, key string) (*Namespace, error) {
  197. if err := createNamespaceFile(key); err != nil {
  198. return nil, err
  199. }
  200. if err := mountNetworkNamespace(basePath, key); err != nil {
  201. return nil, err
  202. }
  203. n := &Namespace{path: key, nextIfIndex: make(map[string]int)}
  204. sboxNs, err := netns.GetFromPath(n.path)
  205. if err != nil {
  206. return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
  207. }
  208. defer sboxNs.Close()
  209. n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
  210. if err != nil {
  211. return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
  212. }
  213. err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
  214. if err != nil {
  215. log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
  216. }
  217. if err = n.loopbackUp(); err != nil {
  218. n.nlHandle.Close()
  219. return nil, err
  220. }
  221. return n, nil
  222. }
  223. func createNetworkNamespace(path string, osCreate bool) error {
  224. if err := createNamespaceFile(path); err != nil {
  225. return err
  226. }
  227. do := func() error {
  228. return mountNetworkNamespace(fmt.Sprintf("/proc/self/task/%d/ns/net", unix.Gettid()), path)
  229. }
  230. if osCreate {
  231. return unshare.Go(unix.CLONE_NEWNET, do, nil)
  232. }
  233. return do()
  234. }
  235. func unmountNamespaceFile(path string) {
  236. if _, err := os.Stat(path); err != nil {
  237. // ignore when we cannot stat the path
  238. return
  239. }
  240. if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) {
  241. log.G(context.TODO()).WithError(err).Error("Error unmounting namespace file")
  242. }
  243. }
  244. func createNamespaceFile(path string) error {
  245. once.Do(createBasePath)
  246. // Remove it from garbage collection list if present
  247. removeFromGarbagePaths(path)
  248. // If the path is there unmount it first
  249. unmountNamespaceFile(path)
  250. // wait for garbage collection to complete if it is in progress
  251. // before trying to create the file.
  252. //
  253. // TODO(aker): This garbage-collection was for a kernel bug in kernels 3.18-4.0.1: is this still needed on current kernels (and on kernel 3.10)? see https://github.com/moby/moby/pull/46315/commits/c0a6beba8e61d4019e1806d5241ba22007072ca2#r1331327103
  254. gpmWg.Wait()
  255. f, err := os.Create(path)
  256. if err != nil {
  257. return err
  258. }
  259. _ = f.Close()
  260. return nil
  261. }
  262. // Namespace represents a network sandbox. It represents a Linux network
  263. // namespace, and moves an interface into it when called on method AddInterface
  264. // or sets the gateway etc. It holds a list of Interfaces, routes etc., and more
  265. // can be added dynamically.
  266. type Namespace struct {
  267. path string
  268. iFaces []*Interface
  269. gw net.IP
  270. gwv6 net.IP
  271. staticRoutes []*types.StaticRoute
  272. neighbors []*neigh
  273. nextIfIndex map[string]int
  274. isDefault bool
  275. ipv6LoEnabledOnce sync.Once
  276. ipv6LoEnabledCached bool
  277. nlHandle *netlink.Handle
  278. mu sync.Mutex
  279. }
  280. // Interfaces returns the collection of Interface previously added with the AddInterface
  281. // method. Note that this doesn't include network interfaces added in any
  282. // other way (such as the default loopback interface which is automatically
  283. // created on creation of a sandbox).
  284. func (n *Namespace) Interfaces() []*Interface {
  285. ifaces := make([]*Interface, len(n.iFaces))
  286. copy(ifaces, n.iFaces)
  287. return ifaces
  288. }
  289. func (n *Namespace) loopbackUp() error {
  290. iface, err := n.nlHandle.LinkByName("lo")
  291. if err != nil {
  292. return err
  293. }
  294. return n.nlHandle.LinkSetUp(iface)
  295. }
  296. // GetLoopbackIfaceName returns the name of the loopback interface
  297. func (n *Namespace) GetLoopbackIfaceName() string {
  298. return "lo"
  299. }
  300. // AddAliasIP adds the passed IP address to the named interface
  301. func (n *Namespace) AddAliasIP(ifName string, ip *net.IPNet) error {
  302. iface, err := n.nlHandle.LinkByName(ifName)
  303. if err != nil {
  304. return err
  305. }
  306. return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip})
  307. }
  308. // RemoveAliasIP removes the passed IP address from the named interface
  309. func (n *Namespace) RemoveAliasIP(ifName string, ip *net.IPNet) error {
  310. iface, err := n.nlHandle.LinkByName(ifName)
  311. if err != nil {
  312. return err
  313. }
  314. return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip})
  315. }
  316. // DisableARPForVIP disables ARP replies and requests for VIP addresses
  317. // on a particular interface.
  318. func (n *Namespace) DisableARPForVIP(srcName string) (Err error) {
  319. dstName := ""
  320. for _, i := range n.Interfaces() {
  321. if i.SrcName() == srcName {
  322. dstName = i.DstName()
  323. break
  324. }
  325. }
  326. if dstName == "" {
  327. return fmt.Errorf("failed to find interface %s in sandbox", srcName)
  328. }
  329. err := n.InvokeFunc(func() {
  330. path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore")
  331. if err := os.WriteFile(path, []byte{'1', '\n'}, 0o644); err != nil {
  332. Err = fmt.Errorf("Failed to set %s to 1: %v", path, err)
  333. return
  334. }
  335. path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce")
  336. if err := os.WriteFile(path, []byte{'2', '\n'}, 0o644); err != nil {
  337. Err = fmt.Errorf("Failed to set %s to 2: %v", path, err)
  338. return
  339. }
  340. })
  341. if err != nil {
  342. return err
  343. }
  344. return
  345. }
  346. // InvokeFunc invoke a function in the network namespace.
  347. func (n *Namespace) InvokeFunc(f func()) error {
  348. path := n.nsPath()
  349. newNS, err := netns.GetFromPath(path)
  350. if err != nil {
  351. return fmt.Errorf("failed get network namespace %q: %w", path, err)
  352. }
  353. defer newNS.Close()
  354. done := make(chan error, 1)
  355. go func() {
  356. runtime.LockOSThread()
  357. // InvokeFunc() could have been called from a goroutine with
  358. // tampered thread state, e.g. from another InvokeFunc()
  359. // callback. The outer goroutine's thread state cannot be
  360. // trusted.
  361. origNS, err := netns.Get()
  362. if err != nil {
  363. runtime.UnlockOSThread()
  364. done <- fmt.Errorf("failed to get original network namespace: %w", err)
  365. return
  366. }
  367. defer origNS.Close()
  368. if err := netns.Set(newNS); err != nil {
  369. runtime.UnlockOSThread()
  370. done <- err
  371. return
  372. }
  373. defer func() {
  374. close(done)
  375. if err := netns.Set(origNS); err != nil {
  376. log.G(context.TODO()).WithError(err).Warn("failed to restore thread's network namespace")
  377. // Recover from the error by leaving this goroutine locked to
  378. // the thread. The runtime will terminate the thread and replace
  379. // it with a clean one when this goroutine returns.
  380. } else {
  381. runtime.UnlockOSThread()
  382. }
  383. }()
  384. f()
  385. }()
  386. return <-done
  387. }
  388. func (n *Namespace) nsPath() string {
  389. n.mu.Lock()
  390. defer n.mu.Unlock()
  391. return n.path
  392. }
  393. // Key returns the path where the network namespace is mounted.
  394. func (n *Namespace) Key() string {
  395. return n.path
  396. }
  397. // Destroy destroys the sandbox.
  398. func (n *Namespace) Destroy() error {
  399. if n.nlHandle != nil {
  400. n.nlHandle.Close()
  401. }
  402. // Assuming no running process is executing in this network namespace,
  403. // unmounting is sufficient to destroy it.
  404. if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil {
  405. return err
  406. }
  407. // Stash it into the garbage collection list
  408. addToGarbagePaths(n.path)
  409. return nil
  410. }
  411. // Restore restores the network namespace.
  412. func (n *Namespace) Restore(interfaces map[Iface][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error {
  413. // restore interfaces
  414. for iface, opts := range interfaces {
  415. i, err := newInterface(n, iface.SrcName, iface.DstPrefix, opts...)
  416. if err != nil {
  417. return err
  418. }
  419. if n.isDefault {
  420. i.dstName = i.srcName
  421. } else {
  422. links, err := n.nlHandle.LinkList()
  423. if err != nil {
  424. return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path)
  425. }
  426. // due to the docker network connect/disconnect, so the dstName should
  427. // restore from the namespace
  428. for _, link := range links {
  429. ifaceName := link.Attrs().Name
  430. if i.dstName == "vxlan" && strings.HasPrefix(ifaceName, "vxlan") {
  431. i.dstName = ifaceName
  432. break
  433. }
  434. // find the interface name by ip
  435. if i.address != nil {
  436. addresses, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4)
  437. if err != nil {
  438. return err
  439. }
  440. for _, addr := range addresses {
  441. if addr.IPNet.String() == i.address.String() {
  442. i.dstName = ifaceName
  443. break
  444. }
  445. }
  446. if i.dstName == ifaceName {
  447. break
  448. }
  449. }
  450. // This is to find the interface name of the pair in overlay sandbox
  451. if i.master != "" && i.dstName == "veth" && strings.HasPrefix(ifaceName, "veth") {
  452. i.dstName = ifaceName
  453. }
  454. }
  455. var index int
  456. if idx := strings.TrimPrefix(i.dstName, iface.DstPrefix); idx != "" {
  457. index, err = strconv.Atoi(idx)
  458. if err != nil {
  459. return fmt.Errorf("failed to restore interface in network namespace %q: invalid dstName for interface: %s: %v", n.path, i.dstName, err)
  460. }
  461. }
  462. index++
  463. n.mu.Lock()
  464. if index > n.nextIfIndex[iface.DstPrefix] {
  465. n.nextIfIndex[iface.DstPrefix] = index
  466. }
  467. n.iFaces = append(n.iFaces, i)
  468. n.mu.Unlock()
  469. }
  470. }
  471. // restore routes and gateways
  472. n.mu.Lock()
  473. n.staticRoutes = append(n.staticRoutes, routes...)
  474. if len(gw) > 0 {
  475. n.gw = gw
  476. }
  477. if len(gw6) > 0 {
  478. n.gwv6 = gw6
  479. }
  480. n.mu.Unlock()
  481. return nil
  482. }
  483. // IPv6LoEnabled returns true if the loopback interface had an IPv6 address when
  484. // last checked. It's always checked on the first call, and by RefreshIPv6LoEnabled.
  485. // ('::1' is assigned by the kernel if IPv6 is enabled.)
  486. func (n *Namespace) IPv6LoEnabled() bool {
  487. n.ipv6LoEnabledOnce.Do(func() {
  488. n.RefreshIPv6LoEnabled()
  489. })
  490. n.mu.Lock()
  491. defer n.mu.Unlock()
  492. return n.ipv6LoEnabledCached
  493. }
  494. // RefreshIPv6LoEnabled refreshes the cached result returned by IPv6LoEnabled.
  495. func (n *Namespace) RefreshIPv6LoEnabled() {
  496. n.mu.Lock()
  497. defer n.mu.Unlock()
  498. // If anything goes wrong, assume no-IPv6.
  499. n.ipv6LoEnabledCached = false
  500. iface, err := n.nlHandle.LinkByName("lo")
  501. if err != nil {
  502. log.G(context.TODO()).WithError(err).Warn("Unable to find 'lo' to determine IPv6 support")
  503. return
  504. }
  505. addrs, err := n.nlHandle.AddrList(iface, nl.FAMILY_V6)
  506. if err != nil {
  507. log.G(context.TODO()).WithError(err).Warn("Unable to get 'lo' addresses to determine IPv6 support")
  508. return
  509. }
  510. n.ipv6LoEnabledCached = len(addrs) > 0
  511. }
  512. // ApplyOSTweaks applies operating system specific knobs on the sandbox.
  513. func (n *Namespace) ApplyOSTweaks(types []SandboxType) {
  514. for _, t := range types {
  515. switch t {
  516. case SandboxTypeLoadBalancer, SandboxTypeIngress:
  517. kernel.ApplyOSTweaks(map[string]*kernel.OSValue{
  518. // disables any special handling on port reuse of existing IPVS connection table entries
  519. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32
  520. "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
  521. // expires connection from the IPVS connection table when the backend is not available
  522. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133
  523. "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
  524. // expires persistent connections to destination servers with weights set to 0
  525. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151
  526. "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
  527. })
  528. }
  529. }
  530. }
  531. func setIPv6(nspath, iface string, enable bool) error {
  532. errCh := make(chan error, 1)
  533. go func() {
  534. defer close(errCh)
  535. namespace, err := netns.GetFromPath(nspath)
  536. if err != nil {
  537. errCh <- fmt.Errorf("failed get network namespace %q: %w", nspath, err)
  538. return
  539. }
  540. defer namespace.Close()
  541. runtime.LockOSThread()
  542. origNS, err := netns.Get()
  543. if err != nil {
  544. runtime.UnlockOSThread()
  545. errCh <- fmt.Errorf("failed to get current network namespace: %w", err)
  546. return
  547. }
  548. defer origNS.Close()
  549. if err = netns.Set(namespace); err != nil {
  550. runtime.UnlockOSThread()
  551. errCh <- fmt.Errorf("setting into container netns %q failed: %w", nspath, err)
  552. return
  553. }
  554. defer func() {
  555. if err := netns.Set(origNS); err != nil {
  556. log.G(context.TODO()).WithError(err).Error("libnetwork: restoring thread network namespace failed")
  557. // The error is only fatal for the current thread. Keep this
  558. // goroutine locked to the thread to make the runtime replace it
  559. // with a clean thread once this goroutine returns.
  560. } else {
  561. runtime.UnlockOSThread()
  562. }
  563. }()
  564. var (
  565. action = "disable"
  566. value = byte('1')
  567. path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", iface)
  568. )
  569. if enable {
  570. action = "enable"
  571. value = '0'
  572. }
  573. if _, err := os.Stat(path); err != nil {
  574. if os.IsNotExist(err) {
  575. log.G(context.TODO()).WithError(err).Warn("Cannot configure IPv6 forwarding on container interface. Has IPv6 been disabled in this node's kernel?")
  576. return
  577. }
  578. errCh <- err
  579. return
  580. }
  581. if err = os.WriteFile(path, []byte{value, '\n'}, 0o644); err != nil {
  582. errCh <- fmt.Errorf("failed to %s IPv6 forwarding for container's interface %s: %w", action, iface, err)
  583. return
  584. }
  585. }()
  586. return <-errCh
  587. }