namespace_linux.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685
  1. package osl
  2. import (
  3. "context"
  4. "errors"
  5. "fmt"
  6. "net"
  7. "os"
  8. "path/filepath"
  9. "runtime"
  10. "strconv"
  11. "strings"
  12. "sync"
  13. "syscall"
  14. "time"
  15. "github.com/containerd/containerd/log"
  16. "github.com/docker/docker/internal/unshare"
  17. "github.com/docker/docker/libnetwork/ns"
  18. "github.com/docker/docker/libnetwork/osl/kernel"
  19. "github.com/docker/docker/libnetwork/types"
  20. "github.com/vishvananda/netlink"
  21. "github.com/vishvananda/netns"
  22. "golang.org/x/sys/unix"
  23. )
  // defaultPrefix is the initial value of the package-level prefix variable,
  // i.e. the directory under which basePath() places the "netns" directory
  // unless SetBasePath is called.
  const defaultPrefix = "/var/run/docker"
  // init pins the goroutine running package initialization (and therefore
  // main()) to the OS thread it is currently executing on.
  func init() {
  	// Lock main() to the initial thread to exclude the goroutines spawned
  	// by func (*networkNamespace) InvokeFunc() or func setIPv6() below from
  	// being scheduled onto that thread. Changes to the network namespace of
  	// the initial thread alter /proc/self/ns/net, which would break any
  	// code which (incorrectly) assumes that that file is the network
  	// namespace for the thread it is currently executing on.
  	runtime.LockOSThread()
  }
  var (
  	// once guards createBasePath so the base directory is created and the
  	// garbage-collection goroutine is started a single time.
  	once sync.Once
  	// garbagePathMap is the set of namespace file paths waiting to be
  	// removed by removeUnusedPaths. Guarded by gpmLock.
  	garbagePathMap = make(map[string]bool)
  	// gpmLock protects garbagePathMap; removeUnusedPaths also holds it
  	// while reading gpmCleanupPeriod.
  	gpmLock sync.Mutex
  	// gpmWg is held by removeUnusedPaths while it deletes files, so that
  	// createNamespaceFile can wait for an in-progress sweep to finish.
  	gpmWg sync.WaitGroup
  	// gpmCleanupPeriod is the ticker interval of the periodic sweep.
  	gpmCleanupPeriod = 60 * time.Second
  	// gpmChan carries on-demand sweep requests from GC; the channel sent
  	// over it is closed once that sweep has completed.
  	gpmChan = make(chan chan struct{})
  	// prefix is the directory joined with "netns" by basePath; it can be
  	// overridden through SetBasePath.
  	prefix = defaultPrefix
  )
  // The networkNamespace type is the linux implementation of the Sandbox
  // interface. It represents a linux network namespace, and moves an interface
  // into it when called on method AddInterface or sets the gateway etc.
  type networkNamespace struct {
  	path         string               // filesystem path of the namespace; returned by Key and nsPath
  	iFaces       []*nwIface           // interfaces tracked for this namespace (appended to in Restore)
  	gw           net.IP               // gateway recorded by Restore from its gw argument
  	gwv6         net.IP               // gateway recorded by Restore from its gw6 argument
  	staticRoutes []*types.StaticRoute // static routes recorded by Restore
  	neighbors    []*neigh             // NOTE(review): not touched in this part of the file; presumably neighbor entries
  	nextIfIndex  map[string]int       // per destination-prefix counter: next numeric suffix to use
  	isDefault    bool                 // set to !osCreate by NewSandbox
  	nlHandle     *netlink.Handle      // netlink handle opened against the namespace at path
  	loV6Enabled  bool                 // last IPv6 state checkLoV6 requested for the "lo" interface
  	sync.Mutex                        // taken around accesses to path, iFaces, routes, gateways and nextIfIndex below
  }
  // SetBasePath sets the base directory prefix under which the "netns"
  // directory (see basePath) is located. It overwrites the package-level
  // prefix without taking any lock.
  func SetBasePath(path string) {
  	prefix = path
  }
  // basePath returns the directory that holds the namespace files: the
  // current prefix joined with "netns".
  func basePath() string {
  	return filepath.Join(prefix, "netns")
  }
  66. func createBasePath() {
  67. err := os.MkdirAll(basePath(), 0o755)
  68. if err != nil {
  69. panic("Could not create net namespace path directory")
  70. }
  71. // Start the garbage collection go routine
  72. go removeUnusedPaths()
  73. }
  74. func removeUnusedPaths() {
  75. gpmLock.Lock()
  76. period := gpmCleanupPeriod
  77. gpmLock.Unlock()
  78. ticker := time.NewTicker(period)
  79. for {
  80. var (
  81. gc chan struct{}
  82. gcOk bool
  83. )
  84. select {
  85. case <-ticker.C:
  86. case gc, gcOk = <-gpmChan:
  87. }
  88. gpmLock.Lock()
  89. pathList := make([]string, 0, len(garbagePathMap))
  90. for path := range garbagePathMap {
  91. pathList = append(pathList, path)
  92. }
  93. garbagePathMap = make(map[string]bool)
  94. gpmWg.Add(1)
  95. gpmLock.Unlock()
  96. for _, path := range pathList {
  97. os.Remove(path)
  98. }
  99. gpmWg.Done()
  100. if gcOk {
  101. close(gc)
  102. }
  103. }
  104. }
  105. func addToGarbagePaths(path string) {
  106. gpmLock.Lock()
  107. garbagePathMap[path] = true
  108. gpmLock.Unlock()
  109. }
  110. func removeFromGarbagePaths(path string) {
  111. gpmLock.Lock()
  112. delete(garbagePathMap, path)
  113. gpmLock.Unlock()
  114. }
  115. // GC triggers garbage collection of namespace path right away
  116. // and waits for it.
  117. func GC() {
  118. gpmLock.Lock()
  119. if len(garbagePathMap) == 0 {
  120. // No need for GC if map is empty
  121. gpmLock.Unlock()
  122. return
  123. }
  124. gpmLock.Unlock()
  125. // if content exists in the garbage paths
  126. // we can trigger GC to run, providing a
  127. // channel to be notified on completion
  128. waitGC := make(chan struct{})
  129. gpmChan <- waitGC
  130. // wait for GC completion
  131. <-waitGC
  132. }
  133. // GenerateKey generates a sandbox key based on the passed
  134. // container id.
  135. func GenerateKey(containerID string) string {
  136. maxLen := 12
  137. // Read sandbox key from host for overlay
  138. if strings.HasPrefix(containerID, "-") {
  139. var (
  140. index int
  141. indexStr string
  142. tmpkey string
  143. )
  144. dir, err := os.ReadDir(basePath())
  145. if err != nil {
  146. return ""
  147. }
  148. for _, v := range dir {
  149. id := v.Name()
  150. if strings.HasSuffix(id, containerID[:maxLen-1]) {
  151. indexStr = strings.TrimSuffix(id, containerID[:maxLen-1])
  152. tmpindex, err := strconv.Atoi(indexStr)
  153. if err != nil {
  154. return ""
  155. }
  156. if tmpindex > index {
  157. index = tmpindex
  158. tmpkey = id
  159. }
  160. }
  161. }
  162. containerID = tmpkey
  163. if containerID == "" {
  164. return ""
  165. }
  166. }
  167. if len(containerID) < maxLen {
  168. maxLen = len(containerID)
  169. }
  170. return basePath() + "/" + containerID[:maxLen]
  171. }
  172. // NewSandbox provides a new sandbox instance created in an os specific way
  173. // provided a key which uniquely identifies the sandbox
  174. func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) {
  175. if !isRestore {
  176. err := createNetworkNamespace(key, osCreate)
  177. if err != nil {
  178. return nil, err
  179. }
  180. } else {
  181. once.Do(createBasePath)
  182. }
  183. n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)}
  184. sboxNs, err := netns.GetFromPath(n.path)
  185. if err != nil {
  186. return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
  187. }
  188. defer sboxNs.Close()
  189. n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
  190. if err != nil {
  191. return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
  192. }
  193. err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
  194. if err != nil {
  195. log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
  196. }
  197. // In live-restore mode, IPV6 entries are getting cleaned up due to below code
  198. // We should retain IPV6 configurations in live-restore mode when Docker Daemon
  199. // comes back. It should work as it is on other cases
  200. // As starting point, disable IPv6 on all interfaces
  201. if !isRestore && !n.isDefault {
  202. err = setIPv6(n.path, "all", false)
  203. if err != nil {
  204. log.G(context.TODO()).Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
  205. }
  206. }
  207. if err = n.loopbackUp(); err != nil {
  208. n.nlHandle.Close()
  209. return nil, err
  210. }
  211. return n, nil
  212. }
  // InterfaceOptions returns the namespace itself as the IfaceOptionSetter.
  func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter {
  	return n
  }
  // NeighborOptions returns the namespace itself as the NeighborOptionSetter.
  func (n *networkNamespace) NeighborOptions() NeighborOptionSetter {
  	return n
  }
  // mountNetworkNamespace bind-mounts (MS_BIND) the namespace reference found
  // at basePath onto lnPath and returns the error from the mount syscall.
  // Note that the basePath parameter shadows the package-level basePath
  // function inside this function.
  func mountNetworkNamespace(basePath string, lnPath string) error {
  	return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "")
  }
  222. // GetSandboxForExternalKey returns sandbox object for the supplied path
  223. func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) {
  224. if err := createNamespaceFile(key); err != nil {
  225. return nil, err
  226. }
  227. if err := mountNetworkNamespace(basePath, key); err != nil {
  228. return nil, err
  229. }
  230. n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)}
  231. sboxNs, err := netns.GetFromPath(n.path)
  232. if err != nil {
  233. return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
  234. }
  235. defer sboxNs.Close()
  236. n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
  237. if err != nil {
  238. return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
  239. }
  240. err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
  241. if err != nil {
  242. log.G(context.TODO()).Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
  243. }
  244. // As starting point, disable IPv6 on all interfaces
  245. err = setIPv6(n.path, "all", false)
  246. if err != nil {
  247. log.G(context.TODO()).Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
  248. }
  249. if err = n.loopbackUp(); err != nil {
  250. n.nlHandle.Close()
  251. return nil, err
  252. }
  253. return n, nil
  254. }
  255. func createNetworkNamespace(path string, osCreate bool) error {
  256. if err := createNamespaceFile(path); err != nil {
  257. return err
  258. }
  259. do := func() error {
  260. return mountNetworkNamespace(fmt.Sprintf("/proc/self/task/%d/ns/net", unix.Gettid()), path)
  261. }
  262. if osCreate {
  263. return unshare.Go(unix.CLONE_NEWNET, do, nil)
  264. }
  265. return do()
  266. }
  267. func unmountNamespaceFile(path string) {
  268. if _, err := os.Stat(path); err == nil {
  269. if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) {
  270. log.G(context.TODO()).WithError(err).Error("Error unmounting namespace file")
  271. }
  272. }
  273. }
  274. func createNamespaceFile(path string) (err error) {
  275. var f *os.File
  276. once.Do(createBasePath)
  277. // Remove it from garbage collection list if present
  278. removeFromGarbagePaths(path)
  279. // If the path is there unmount it first
  280. unmountNamespaceFile(path)
  281. // wait for garbage collection to complete if it is in progress
  282. // before trying to create the file.
  283. gpmWg.Wait()
  284. if f, err = os.Create(path); err == nil {
  285. f.Close()
  286. }
  287. return err
  288. }
  289. func (n *networkNamespace) loopbackUp() error {
  290. iface, err := n.nlHandle.LinkByName("lo")
  291. if err != nil {
  292. return err
  293. }
  294. return n.nlHandle.LinkSetUp(iface)
  295. }
  // GetLoopbackIfaceName returns the name of the loopback interface, which is
  // always "lo".
  func (n *networkNamespace) GetLoopbackIfaceName() string {
  	return "lo"
  }
  299. func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error {
  300. iface, err := n.nlHandle.LinkByName(ifName)
  301. if err != nil {
  302. return err
  303. }
  304. return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip})
  305. }
  306. func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error {
  307. iface, err := n.nlHandle.LinkByName(ifName)
  308. if err != nil {
  309. return err
  310. }
  311. return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip})
  312. }
  313. func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) {
  314. dstName := ""
  315. for _, i := range n.Interfaces() {
  316. if i.SrcName() == srcName {
  317. dstName = i.DstName()
  318. break
  319. }
  320. }
  321. if dstName == "" {
  322. return fmt.Errorf("failed to find interface %s in sandbox", srcName)
  323. }
  324. err := n.InvokeFunc(func() {
  325. path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore")
  326. if err := os.WriteFile(path, []byte{'1', '\n'}, 0o644); err != nil {
  327. Err = fmt.Errorf("Failed to set %s to 1: %v", path, err)
  328. return
  329. }
  330. path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce")
  331. if err := os.WriteFile(path, []byte{'2', '\n'}, 0o644); err != nil {
  332. Err = fmt.Errorf("Failed to set %s to 2: %v", path, err)
  333. return
  334. }
  335. })
  336. if err != nil {
  337. return err
  338. }
  339. return
  340. }
  // InvokeFunc runs f on a dedicated goroutine whose OS thread has been
  // switched into the sandbox's network namespace, and blocks until f has
  // returned.
  //
  // It returns an error when the sandbox namespace cannot be opened, when the
  // worker thread's current namespace cannot be determined, or when switching
  // into the sandbox namespace fails; in those cases f is not called. f has no
  // return value, so a nil result only means f was run.
  func (n *networkNamespace) InvokeFunc(f func()) error {
  	path := n.nsPath()
  	newNS, err := netns.GetFromPath(path)
  	if err != nil {
  		return fmt.Errorf("failed get network namespace %q: %w", path, err)
  	}
  	defer newNS.Close()

  	// Buffered so the worker can report an early failure without blocking.
  	done := make(chan error, 1)
  	go func() {
  		// Namespace membership is per-thread: pin this goroutine to its
  		// thread for as long as the thread is in a foreign namespace.
  		runtime.LockOSThread()
  		// InvokeFunc() could have been called from a goroutine with
  		// tampered thread state, e.g. from another InvokeFunc()
  		// callback. The outer goroutine's thread state cannot be
  		// trusted.
  		origNS, err := netns.Get()
  		if err != nil {
  			runtime.UnlockOSThread()
  			done <- fmt.Errorf("failed to get original network namespace: %w", err)
  			return
  		}
  		defer origNS.Close()

  		if err := netns.Set(newNS); err != nil {
  			runtime.UnlockOSThread()
  			done <- err
  			return
  		}
  		defer func() {
  			// Closing done releases the caller (which then receives a nil
  			// error); the thread's namespace is restored afterwards.
  			close(done)
  			if err := netns.Set(origNS); err != nil {
  				log.G(context.TODO()).WithError(err).Warn("failed to restore thread's network namespace")
  				// Recover from the error by leaving this goroutine locked to
  				// the thread. The runtime will terminate the thread and replace
  				// it with a clean one when this goroutine returns.
  			} else {
  				runtime.UnlockOSThread()
  			}
  		}()
  		f()
  	}()
  	return <-done
  }
  382. func (n *networkNamespace) nsPath() string {
  383. n.Lock()
  384. defer n.Unlock()
  385. return n.path
  386. }
  // Info returns the namespace itself as the Info implementation.
  func (n *networkNamespace) Info() Info {
  	return n
  }
  // Key returns the path of the network namespace. Unlike nsPath it reads the
  // field without taking the namespace's mutex.
  func (n *networkNamespace) Key() string {
  	return n.path
  }
  393. func (n *networkNamespace) Destroy() error {
  394. if n.nlHandle != nil {
  395. n.nlHandle.Close()
  396. }
  397. // Assuming no running process is executing in this network namespace,
  398. // unmounting is sufficient to destroy it.
  399. if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil {
  400. return err
  401. }
  402. // Stash it into the garbage collection list
  403. addToGarbagePaths(n.path)
  404. return nil
  405. }
  406. // Restore restore the network namespace
  407. func (n *networkNamespace) Restore(ifsopt map[Iface][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error {
  408. // restore interfaces
  409. for name, opts := range ifsopt {
  410. i := &nwIface{
  411. srcName: name.SrcName,
  412. dstName: name.DstPrefix,
  413. ns: n,
  414. }
  415. i.processInterfaceOptions(opts...)
  416. if i.master != "" {
  417. i.dstMaster = n.findDst(i.master, true)
  418. if i.dstMaster == "" {
  419. return fmt.Errorf("could not find an appropriate master %q for %q",
  420. i.master, i.srcName)
  421. }
  422. }
  423. if n.isDefault {
  424. i.dstName = i.srcName
  425. } else {
  426. links, err := n.nlHandle.LinkList()
  427. if err != nil {
  428. return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path)
  429. }
  430. // due to the docker network connect/disconnect, so the dstName should
  431. // restore from the namespace
  432. for _, link := range links {
  433. addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4)
  434. if err != nil {
  435. return err
  436. }
  437. ifaceName := link.Attrs().Name
  438. if strings.HasPrefix(ifaceName, "vxlan") {
  439. if i.dstName == "vxlan" {
  440. i.dstName = ifaceName
  441. break
  442. }
  443. }
  444. // find the interface name by ip
  445. if i.address != nil {
  446. for _, addr := range addrs {
  447. if addr.IPNet.String() == i.address.String() {
  448. i.dstName = ifaceName
  449. break
  450. }
  451. continue
  452. }
  453. if i.dstName == ifaceName {
  454. break
  455. }
  456. }
  457. // This is to find the interface name of the pair in overlay sandbox
  458. if strings.HasPrefix(ifaceName, "veth") {
  459. if i.master != "" && i.dstName == "veth" {
  460. i.dstName = ifaceName
  461. }
  462. }
  463. }
  464. var index int
  465. indexStr := strings.TrimPrefix(i.dstName, name.DstPrefix)
  466. if indexStr != "" {
  467. index, err = strconv.Atoi(indexStr)
  468. if err != nil {
  469. return err
  470. }
  471. }
  472. index++
  473. n.Lock()
  474. if index > n.nextIfIndex[name.DstPrefix] {
  475. n.nextIfIndex[name.DstPrefix] = index
  476. }
  477. n.iFaces = append(n.iFaces, i)
  478. n.Unlock()
  479. }
  480. }
  481. // restore routes
  482. for _, r := range routes {
  483. n.Lock()
  484. n.staticRoutes = append(n.staticRoutes, r)
  485. n.Unlock()
  486. }
  487. // restore gateway
  488. if len(gw) > 0 {
  489. n.Lock()
  490. n.gw = gw
  491. n.Unlock()
  492. }
  493. if len(gw6) > 0 {
  494. n.Lock()
  495. n.gwv6 = gw6
  496. n.Unlock()
  497. }
  498. return nil
  499. }
  500. // Checks whether IPv6 needs to be enabled/disabled on the loopback interface
  501. func (n *networkNamespace) checkLoV6() {
  502. var (
  503. enable = false
  504. action = "disable"
  505. )
  506. n.Lock()
  507. for _, iface := range n.iFaces {
  508. if iface.AddressIPv6() != nil {
  509. enable = true
  510. action = "enable"
  511. break
  512. }
  513. }
  514. n.Unlock()
  515. if n.loV6Enabled == enable {
  516. return
  517. }
  518. if err := setIPv6(n.path, "lo", enable); err != nil {
  519. log.G(context.TODO()).Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err)
  520. }
  521. n.loV6Enabled = enable
  522. }
  // setIPv6 enables or disables IPv6 for interface iface inside the network
  // namespace at nspath by writing '0' (enable) or '1' (disable) to
  // /proc/sys/net/ipv6/conf/<iface>/disable_ipv6. The work is done on a
  // separate goroutine that is locked to its OS thread while that thread is
  // switched into the target namespace; setIPv6 blocks until it finishes.
  //
  // It returns an error when the namespace cannot be opened or entered, when
  // the sysctl file cannot be stat'ed for a reason other than not existing,
  // or when the write fails. A missing sysctl file only produces a warning
  // log entry and a nil result.
  func setIPv6(nspath, iface string, enable bool) error {
  	// Buffered so the worker can send its single error without blocking;
  	// closing the channel on exit makes the receive below yield nil on
  	// success.
  	errCh := make(chan error, 1)
  	go func() {
  		defer close(errCh)

  		namespace, err := netns.GetFromPath(nspath)
  		if err != nil {
  			errCh <- fmt.Errorf("failed get network namespace %q: %w", nspath, err)
  			return
  		}
  		defer namespace.Close()

  		// Namespace membership is per-thread: pin this goroutine to its
  		// thread before switching namespaces.
  		runtime.LockOSThread()

  		origNS, err := netns.Get()
  		if err != nil {
  			runtime.UnlockOSThread()
  			errCh <- fmt.Errorf("failed to get current network namespace: %w", err)
  			return
  		}
  		defer origNS.Close()

  		if err = netns.Set(namespace); err != nil {
  			runtime.UnlockOSThread()
  			errCh <- fmt.Errorf("setting into container netns %q failed: %w", nspath, err)
  			return
  		}
  		defer func() {
  			if err := netns.Set(origNS); err != nil {
  				log.G(context.TODO()).WithError(err).Error("libnetwork: restoring thread network namespace failed")
  				// The error is only fatal for the current thread. Keep this
  				// goroutine locked to the thread to make the runtime replace it
  				// with a clean thread once this goroutine returns.
  			} else {
  				runtime.UnlockOSThread()
  			}
  		}()

  		var (
  			action = "disable"
  			value  = byte('1') // disable_ipv6=1 turns IPv6 off
  			path   = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", iface)
  		)

  		if enable {
  			action = "enable"
  			value = '0'
  		}

  		if _, err := os.Stat(path); err != nil {
  			if os.IsNotExist(err) {
  				// Nothing is sent on errCh here, so the caller gets nil.
  				log.G(context.TODO()).WithError(err).Warn("Cannot configure IPv6 forwarding on container interface. Has IPv6 been disabled in this node's kernel?")
  				return
  			}
  			errCh <- err
  			return
  		}

  		if err = os.WriteFile(path, []byte{value, '\n'}, 0o644); err != nil {
  			errCh <- fmt.Errorf("failed to %s IPv6 forwarding for container's interface %s: %w", action, iface, err)
  			return
  		}
  	}()
  	return <-errCh
  }
  580. // ApplyOSTweaks applies linux configs on the sandbox
  581. func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) {
  582. for _, t := range types {
  583. switch t {
  584. case SandboxTypeLoadBalancer, SandboxTypeIngress:
  585. kernel.ApplyOSTweaks(map[string]*kernel.OSValue{
  586. // disables any special handling on port reuse of existing IPVS connection table entries
  587. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32
  588. "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
  589. // expires connection from the IPVS connection table when the backend is not available
  590. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133
  591. "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
  592. // expires persistent connections to destination servers with weights set to 0
  593. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151
  594. "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
  595. })
  596. }
  597. }
  598. }