namespace_linux.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694
  1. package osl
  2. import (
  3. "context"
  4. "errors"
  5. "fmt"
  6. "net"
  7. "os"
  8. "path/filepath"
  9. "runtime"
  10. "strconv"
  11. "strings"
  12. "sync"
  13. "syscall"
  14. "time"
  15. "github.com/containerd/containerd/log"
  16. "github.com/docker/docker/internal/unshare"
  17. "github.com/docker/docker/libnetwork/ns"
  18. "github.com/docker/docker/libnetwork/osl/kernel"
  19. "github.com/docker/docker/libnetwork/types"
  20. "github.com/vishvananda/netlink"
  21. "github.com/vishvananda/netns"
  22. "golang.org/x/sys/unix"
  23. )
// defaultPrefix is the directory whose "netns" subdirectory holds the
// namespace files unless SetBasePath selects a different location.
const defaultPrefix = "/var/run/docker"
// init pins the goroutine running package initialization (and therefore
// main()) to the initial OS thread.
func init() {
	// Lock main() to the initial thread to exclude the goroutines spawned
	// by func (*networkNamespace) InvokeFunc() or func setIPv6() below from
	// being scheduled onto that thread. Changes to the network namespace of
	// the initial thread alter /proc/self/ns/net, which would break any
	// code which (incorrectly) assumes that that file is the network
	// namespace for the thread it is currently executing on.
	runtime.LockOSThread()
}
var (
	// once guards the single call to createBasePath.
	once sync.Once
	// garbagePathMap is the set of namespace file paths queued for removal
	// by removeUnusedPaths.
	garbagePathMap = make(map[string]bool)
	// gpmLock protects garbagePathMap (and is held when gpmCleanupPeriod is
	// read by removeUnusedPaths).
	gpmLock sync.Mutex
	// gpmWg is non-zero while a removal pass is deleting files;
	// createNamespaceFile waits on it before creating a file.
	gpmWg sync.WaitGroup
	// gpmCleanupPeriod is the interval between periodic removal passes.
	gpmCleanupPeriod = 60 * time.Second
	// gpmChan carries on-demand removal requests from GC; the channel sent
	// over it is closed once the requested pass has finished.
	gpmChan = make(chan chan struct{})
	// netnsBasePath is the directory in which namespace files are kept.
	netnsBasePath = filepath.Join(defaultPrefix, "netns")
)
  43. // SetBasePath sets the base url prefix for the ns path
  44. func SetBasePath(path string) {
  45. netnsBasePath = filepath.Join(path, "netns")
  46. }
  47. func basePath() string {
  48. return netnsBasePath
  49. }
  50. func createBasePath() {
  51. err := os.MkdirAll(basePath(), 0o755)
  52. if err != nil {
  53. panic("Could not create net namespace path directory")
  54. }
  55. // Start the garbage collection go routine
  56. go removeUnusedPaths()
  57. }
  58. func removeUnusedPaths() {
  59. gpmLock.Lock()
  60. period := gpmCleanupPeriod
  61. gpmLock.Unlock()
  62. ticker := time.NewTicker(period)
  63. for {
  64. var (
  65. gc chan struct{}
  66. gcOk bool
  67. )
  68. select {
  69. case <-ticker.C:
  70. case gc, gcOk = <-gpmChan:
  71. }
  72. gpmLock.Lock()
  73. pathList := make([]string, 0, len(garbagePathMap))
  74. for path := range garbagePathMap {
  75. pathList = append(pathList, path)
  76. }
  77. garbagePathMap = make(map[string]bool)
  78. gpmWg.Add(1)
  79. gpmLock.Unlock()
  80. for _, path := range pathList {
  81. os.Remove(path)
  82. }
  83. gpmWg.Done()
  84. if gcOk {
  85. close(gc)
  86. }
  87. }
  88. }
  89. func addToGarbagePaths(path string) {
  90. gpmLock.Lock()
  91. garbagePathMap[path] = true
  92. gpmLock.Unlock()
  93. }
  94. func removeFromGarbagePaths(path string) {
  95. gpmLock.Lock()
  96. delete(garbagePathMap, path)
  97. gpmLock.Unlock()
  98. }
  99. // GC triggers garbage collection of namespace path right away
  100. // and waits for it.
  101. func GC() {
  102. gpmLock.Lock()
  103. if len(garbagePathMap) == 0 {
  104. // No need for GC if map is empty
  105. gpmLock.Unlock()
  106. return
  107. }
  108. gpmLock.Unlock()
  109. // if content exists in the garbage paths
  110. // we can trigger GC to run, providing a
  111. // channel to be notified on completion
  112. waitGC := make(chan struct{})
  113. gpmChan <- waitGC
  114. // wait for GC completion
  115. <-waitGC
  116. }
  117. // GenerateKey generates a sandbox key based on the passed
  118. // container id.
  119. func GenerateKey(containerID string) string {
  120. maxLen := 12
  121. // Read sandbox key from host for overlay
  122. if strings.HasPrefix(containerID, "-") {
  123. var (
  124. index int
  125. indexStr string
  126. tmpkey string
  127. )
  128. dir, err := os.ReadDir(basePath())
  129. if err != nil {
  130. return ""
  131. }
  132. for _, v := range dir {
  133. id := v.Name()
  134. if strings.HasSuffix(id, containerID[:maxLen-1]) {
  135. indexStr = strings.TrimSuffix(id, containerID[:maxLen-1])
  136. tmpindex, err := strconv.Atoi(indexStr)
  137. if err != nil {
  138. return ""
  139. }
  140. if tmpindex > index {
  141. index = tmpindex
  142. tmpkey = id
  143. }
  144. }
  145. }
  146. containerID = tmpkey
  147. if containerID == "" {
  148. return ""
  149. }
  150. }
  151. if len(containerID) < maxLen {
  152. maxLen = len(containerID)
  153. }
  154. return basePath() + "/" + containerID[:maxLen]
  155. }
  156. // NewSandbox provides a new sandbox instance created in an os specific way
  157. // provided a key which uniquely identifies the sandbox
  158. func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) {
  159. if !isRestore {
  160. err := createNetworkNamespace(key, osCreate)
  161. if err != nil {
  162. return nil, err
  163. }
  164. } else {
  165. once.Do(createBasePath)
  166. }
  167. n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)}
  168. sboxNs, err := netns.GetFromPath(n.path)
  169. if err != nil {
  170. return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
  171. }
  172. defer sboxNs.Close()
  173. n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
  174. if err != nil {
  175. return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
  176. }
  177. err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
  178. if err != nil {
  179. log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
  180. }
  181. // In live-restore mode, IPV6 entries are getting cleaned up due to below code
  182. // We should retain IPV6 configurations in live-restore mode when Docker Daemon
  183. // comes back. It should work as it is on other cases
  184. // As starting point, disable IPv6 on all interfaces
  185. if !isRestore && !n.isDefault {
  186. err = setIPv6(n.path, "all", false)
  187. if err != nil {
  188. log.G(context.TODO()).Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
  189. }
  190. }
  191. if err = n.loopbackUp(); err != nil {
  192. n.nlHandle.Close()
  193. return nil, err
  194. }
  195. return n, nil
  196. }
  197. func mountNetworkNamespace(basePath string, lnPath string) error {
  198. return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "")
  199. }
  200. // GetSandboxForExternalKey returns sandbox object for the supplied path
  201. func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) {
  202. if err := createNamespaceFile(key); err != nil {
  203. return nil, err
  204. }
  205. if err := mountNetworkNamespace(basePath, key); err != nil {
  206. return nil, err
  207. }
  208. n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)}
  209. sboxNs, err := netns.GetFromPath(n.path)
  210. if err != nil {
  211. return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
  212. }
  213. defer sboxNs.Close()
  214. n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
  215. if err != nil {
  216. return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
  217. }
  218. err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
  219. if err != nil {
  220. log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
  221. }
  222. // As starting point, disable IPv6 on all interfaces
  223. err = setIPv6(n.path, "all", false)
  224. if err != nil {
  225. log.G(context.TODO()).Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
  226. }
  227. if err = n.loopbackUp(); err != nil {
  228. n.nlHandle.Close()
  229. return nil, err
  230. }
  231. return n, nil
  232. }
  233. func createNetworkNamespace(path string, osCreate bool) error {
  234. if err := createNamespaceFile(path); err != nil {
  235. return err
  236. }
  237. do := func() error {
  238. return mountNetworkNamespace(fmt.Sprintf("/proc/self/task/%d/ns/net", unix.Gettid()), path)
  239. }
  240. if osCreate {
  241. return unshare.Go(unix.CLONE_NEWNET, do, nil)
  242. }
  243. return do()
  244. }
  245. func unmountNamespaceFile(path string) {
  246. if _, err := os.Stat(path); err == nil {
  247. if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) {
  248. log.G(context.TODO()).WithError(err).Error("Error unmounting namespace file")
  249. }
  250. }
  251. }
  252. func createNamespaceFile(path string) (err error) {
  253. var f *os.File
  254. once.Do(createBasePath)
  255. // Remove it from garbage collection list if present
  256. removeFromGarbagePaths(path)
  257. // If the path is there unmount it first
  258. unmountNamespaceFile(path)
  259. // wait for garbage collection to complete if it is in progress
  260. // before trying to create the file.
  261. gpmWg.Wait()
  262. if f, err = os.Create(path); err == nil {
  263. f.Close()
  264. }
  265. return err
  266. }
// networkNamespace represents a network sandbox. It represents a Linux network
// namespace, and moves an interface into it when called on method AddInterface
// or sets the gateway etc. It holds a list of Interfaces, routes etc., and more
// can be added dynamically.
type networkNamespace struct {
	// path is the location of the namespace file the namespace is mounted on.
	path string
	// iFaces lists the interfaces tracked for this sandbox.
	iFaces []*Interface
	// gw and gwv6 hold the gateways recorded for the sandbox (set by Restore).
	gw   net.IP
	gwv6 net.IP
	// staticRoutes lists the static routes recorded for the sandbox.
	staticRoutes []*types.StaticRoute
	// neighbors is not used in this part of the file — presumably the
	// neighbor entries programmed in the sandbox; confirm against its users.
	neighbors []*neigh
	// nextIfIndex maps an interface name prefix to the next numeric suffix
	// to hand out for that prefix.
	nextIfIndex map[string]int
	// isDefault is set when the sandbox was not created with a namespace of
	// its own (NewSandbox called with osCreate false).
	isDefault bool
	// nlHandle is the netlink handle opened inside the namespace.
	nlHandle *netlink.Handle
	// loV6Enabled records the IPv6 state last applied to "lo" by checkLoV6.
	loV6Enabled bool
	// The embedded mutex guards the mutable fields above.
	sync.Mutex
}
  284. // Interfaces returns the collection of Interface previously added with the AddInterface
  285. // method. Note that this doesn't include network interfaces added in any
  286. // other way (such as the default loopback interface which is automatically
  287. // created on creation of a sandbox).
  288. func (n *networkNamespace) Interfaces() []*Interface {
  289. ifaces := make([]*Interface, len(n.iFaces))
  290. copy(ifaces, n.iFaces)
  291. return ifaces
  292. }
  293. func (n *networkNamespace) loopbackUp() error {
  294. iface, err := n.nlHandle.LinkByName("lo")
  295. if err != nil {
  296. return err
  297. }
  298. return n.nlHandle.LinkSetUp(iface)
  299. }
  300. // GetLoopbackIfaceName returns the name of the loopback interface
  301. func (n *networkNamespace) GetLoopbackIfaceName() string {
  302. return "lo"
  303. }
  304. // AddAliasIP adds the passed IP address to the named interface
  305. func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error {
  306. iface, err := n.nlHandle.LinkByName(ifName)
  307. if err != nil {
  308. return err
  309. }
  310. return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip})
  311. }
  312. // RemoveAliasIP removes the passed IP address from the named interface
  313. func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error {
  314. iface, err := n.nlHandle.LinkByName(ifName)
  315. if err != nil {
  316. return err
  317. }
  318. return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip})
  319. }
  320. // DisableARPForVIP disables ARP replies and requests for VIP addresses
  321. // on a particular interface.
  322. func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) {
  323. dstName := ""
  324. for _, i := range n.Interfaces() {
  325. if i.SrcName() == srcName {
  326. dstName = i.DstName()
  327. break
  328. }
  329. }
  330. if dstName == "" {
  331. return fmt.Errorf("failed to find interface %s in sandbox", srcName)
  332. }
  333. err := n.InvokeFunc(func() {
  334. path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore")
  335. if err := os.WriteFile(path, []byte{'1', '\n'}, 0o644); err != nil {
  336. Err = fmt.Errorf("Failed to set %s to 1: %v", path, err)
  337. return
  338. }
  339. path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce")
  340. if err := os.WriteFile(path, []byte{'2', '\n'}, 0o644); err != nil {
  341. Err = fmt.Errorf("Failed to set %s to 2: %v", path, err)
  342. return
  343. }
  344. })
  345. if err != nil {
  346. return err
  347. }
  348. return
  349. }
// InvokeFunc invoke a function in the network namespace.
//
// f runs on a separate goroutine that is locked to its OS thread; that thread
// is switched into the sandbox's network namespace before f is called and
// switched back afterwards. InvokeFunc blocks until f has returned, or until
// entering the namespace has failed, in which case f is not called and the
// failure is returned. A nil result means f was called and returned.
func (n *networkNamespace) InvokeFunc(f func()) error {
	path := n.nsPath()
	newNS, err := netns.GetFromPath(path)
	if err != nil {
		return fmt.Errorf("failed get network namespace %q: %w", path, err)
	}
	defer newNS.Close()

	// Buffered so the goroutine can report an early failure without
	// blocking; on the success path the channel is closed instead, which
	// makes the receive below yield nil.
	done := make(chan error, 1)
	go func() {
		runtime.LockOSThread()
		// InvokeFunc() could have been called from a goroutine with
		// tampered thread state, e.g. from another InvokeFunc()
		// callback. The outer goroutine's thread state cannot be
		// trusted.
		origNS, err := netns.Get()
		if err != nil {
			// Nothing was changed on this thread yet, so it is safe to
			// release it.
			runtime.UnlockOSThread()
			done <- fmt.Errorf("failed to get original network namespace: %w", err)
			return
		}
		defer origNS.Close()

		if err := netns.Set(newNS); err != nil {
			runtime.UnlockOSThread()
			done <- err
			return
		}
		defer func() {
			// Signal completion first: the caller is released as soon as f
			// has returned, before the thread's namespace is restored.
			close(done)
			if err := netns.Set(origNS); err != nil {
				log.G(context.TODO()).WithError(err).Warn("failed to restore thread's network namespace")
				// Recover from the error by leaving this goroutine locked to
				// the thread. The runtime will terminate the thread and replace
				// it with a clean one when this goroutine returns.
			} else {
				runtime.UnlockOSThread()
			}
		}()
		f()
	}()
	return <-done
}
  392. func (n *networkNamespace) nsPath() string {
  393. n.Lock()
  394. defer n.Unlock()
  395. return n.path
  396. }
  397. // Key returns the path where the network namespace is mounted.
  398. func (n *networkNamespace) Key() string {
  399. return n.path
  400. }
  401. // Destroy destroys the sandbox.
  402. func (n *networkNamespace) Destroy() error {
  403. if n.nlHandle != nil {
  404. n.nlHandle.Close()
  405. }
  406. // Assuming no running process is executing in this network namespace,
  407. // unmounting is sufficient to destroy it.
  408. if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil {
  409. return err
  410. }
  411. // Stash it into the garbage collection list
  412. addToGarbagePaths(n.path)
  413. return nil
  414. }
  415. // Restore restores the network namespace.
  416. func (n *networkNamespace) Restore(ifsopt map[Iface][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error {
  417. // restore interfaces
  418. for name, opts := range ifsopt {
  419. i := &Interface{
  420. srcName: name.SrcName,
  421. dstName: name.DstPrefix,
  422. ns: n,
  423. }
  424. if err := i.processInterfaceOptions(opts...); err != nil {
  425. return err
  426. }
  427. if i.master != "" {
  428. i.dstMaster = n.findDst(i.master, true)
  429. if i.dstMaster == "" {
  430. return fmt.Errorf("could not find an appropriate master %q for %q",
  431. i.master, i.srcName)
  432. }
  433. }
  434. if n.isDefault {
  435. i.dstName = i.srcName
  436. } else {
  437. links, err := n.nlHandle.LinkList()
  438. if err != nil {
  439. return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path)
  440. }
  441. // due to the docker network connect/disconnect, so the dstName should
  442. // restore from the namespace
  443. for _, link := range links {
  444. addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4)
  445. if err != nil {
  446. return err
  447. }
  448. ifaceName := link.Attrs().Name
  449. if strings.HasPrefix(ifaceName, "vxlan") {
  450. if i.dstName == "vxlan" {
  451. i.dstName = ifaceName
  452. break
  453. }
  454. }
  455. // find the interface name by ip
  456. if i.address != nil {
  457. for _, addr := range addrs {
  458. if addr.IPNet.String() == i.address.String() {
  459. i.dstName = ifaceName
  460. break
  461. }
  462. continue
  463. }
  464. if i.dstName == ifaceName {
  465. break
  466. }
  467. }
  468. // This is to find the interface name of the pair in overlay sandbox
  469. if strings.HasPrefix(ifaceName, "veth") {
  470. if i.master != "" && i.dstName == "veth" {
  471. i.dstName = ifaceName
  472. }
  473. }
  474. }
  475. var index int
  476. indexStr := strings.TrimPrefix(i.dstName, name.DstPrefix)
  477. if indexStr != "" {
  478. index, err = strconv.Atoi(indexStr)
  479. if err != nil {
  480. return err
  481. }
  482. }
  483. index++
  484. n.Lock()
  485. if index > n.nextIfIndex[name.DstPrefix] {
  486. n.nextIfIndex[name.DstPrefix] = index
  487. }
  488. n.iFaces = append(n.iFaces, i)
  489. n.Unlock()
  490. }
  491. }
  492. // restore routes
  493. for _, r := range routes {
  494. n.Lock()
  495. n.staticRoutes = append(n.staticRoutes, r)
  496. n.Unlock()
  497. }
  498. // restore gateway
  499. if len(gw) > 0 {
  500. n.Lock()
  501. n.gw = gw
  502. n.Unlock()
  503. }
  504. if len(gw6) > 0 {
  505. n.Lock()
  506. n.gwv6 = gw6
  507. n.Unlock()
  508. }
  509. return nil
  510. }
  511. // Checks whether IPv6 needs to be enabled/disabled on the loopback interface
  512. func (n *networkNamespace) checkLoV6() {
  513. var (
  514. enable = false
  515. action = "disable"
  516. )
  517. n.Lock()
  518. for _, iface := range n.iFaces {
  519. if iface.AddressIPv6() != nil {
  520. enable = true
  521. action = "enable"
  522. break
  523. }
  524. }
  525. n.Unlock()
  526. if n.loV6Enabled == enable {
  527. return
  528. }
  529. if err := setIPv6(n.path, "lo", enable); err != nil {
  530. log.G(context.TODO()).Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err)
  531. }
  532. n.loV6Enabled = enable
  533. }
  534. // ApplyOSTweaks applies operating system specific knobs on the sandbox.
  535. func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) {
  536. for _, t := range types {
  537. switch t {
  538. case SandboxTypeLoadBalancer, SandboxTypeIngress:
  539. kernel.ApplyOSTweaks(map[string]*kernel.OSValue{
  540. // disables any special handling on port reuse of existing IPVS connection table entries
  541. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32
  542. "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
  543. // expires connection from the IPVS connection table when the backend is not available
  544. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133
  545. "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
  546. // expires persistent connections to destination servers with weights set to 0
  547. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151
  548. "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
  549. })
  550. }
  551. }
  552. }
// setIPv6 enables or disables IPv6 on the interface iface of the network
// namespace found at nspath, by writing to
// /proc/sys/net/ipv6/conf/<iface>/disable_ipv6 from inside that namespace
// ('0' to enable, '1' to disable).
//
// The work is done on a separate goroutine locked to its OS thread, and
// setIPv6 blocks until that goroutine is finished. It returns nil on success
// and also when the sysctl file does not exist (that case is only logged).
//
// NOTE(review): the log and error messages speak of "IPv6 forwarding", while
// the file written is disable_ipv6 — confirm the intended wording.
func setIPv6(nspath, iface string, enable bool) error {
	// Buffered so the goroutine can report a failure without blocking; the
	// deferred close makes the receive below yield nil when nothing was sent.
	errCh := make(chan error, 1)
	go func() {
		defer close(errCh)

		namespace, err := netns.GetFromPath(nspath)
		if err != nil {
			errCh <- fmt.Errorf("failed get network namespace %q: %w", nspath, err)
			return
		}
		defer namespace.Close()

		runtime.LockOSThread()

		origNS, err := netns.Get()
		if err != nil {
			// Nothing was changed on this thread yet, so it is safe to
			// release it.
			runtime.UnlockOSThread()
			errCh <- fmt.Errorf("failed to get current network namespace: %w", err)
			return
		}
		defer origNS.Close()

		if err = netns.Set(namespace); err != nil {
			runtime.UnlockOSThread()
			errCh <- fmt.Errorf("setting into container netns %q failed: %w", nspath, err)
			return
		}
		defer func() {
			if err := netns.Set(origNS); err != nil {
				log.G(context.TODO()).WithError(err).Error("libnetwork: restoring thread network namespace failed")
				// The error is only fatal for the current thread. Keep this
				// goroutine locked to the thread to make the runtime replace it
				// with a clean thread once this goroutine returns.
			} else {
				runtime.UnlockOSThread()
			}
		}()

		var (
			action = "disable"
			value  = byte('1')
			path   = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", iface)
		)

		if enable {
			action = "enable"
			value = '0'
		}

		// A missing sysctl file is tolerated: warn and report success.
		if _, err := os.Stat(path); err != nil {
			if os.IsNotExist(err) {
				log.G(context.TODO()).WithError(err).Warn("Cannot configure IPv6 forwarding on container interface. Has IPv6 been disabled in this node's kernel?")
				return
			}
			errCh <- err
			return
		}

		if err = os.WriteFile(path, []byte{value, '\n'}, 0o644); err != nil {
			errCh <- fmt.Errorf("failed to %s IPv6 forwarding for container's interface %s: %w", action, iface, err)
			return
		}
	}()
	return <-errCh
}