namespace_linux.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686
  1. package osl
  2. import (
  3. "errors"
  4. "fmt"
  5. "net"
  6. "os"
  7. "path/filepath"
  8. "runtime"
  9. "strconv"
  10. "strings"
  11. "sync"
  12. "syscall"
  13. "time"
  14. "github.com/docker/docker/internal/unshare"
  15. "github.com/docker/docker/libnetwork/ns"
  16. "github.com/docker/docker/libnetwork/osl/kernel"
  17. "github.com/docker/docker/libnetwork/types"
  18. "github.com/sirupsen/logrus"
  19. "github.com/vishvananda/netlink"
  20. "github.com/vishvananda/netns"
  21. "golang.org/x/sys/unix"
  22. )
  // defaultPrefix is the initial value of the directory prefix under which
  // the "netns" directory is placed; SetBasePath can replace that prefix.
  const (
  	defaultPrefix = "/var/run/docker"
  )
  // init pins main() to the initial OS thread.
  //
  // The goroutines started by (*networkNamespace).InvokeFunc and setIPv6
  // switch the network namespace of the thread they run on. Keeping the
  // initial thread locked prevents those goroutines from being scheduled
  // onto it: a namespace change on the initial thread alters
  // /proc/self/ns/net, which would break any code that (incorrectly) treats
  // that file as the namespace of whatever thread it is executing on.
  func init() {
  	runtime.LockOSThread()
  }
  33. var (
  34. once sync.Once
  35. garbagePathMap = make(map[string]bool)
  36. gpmLock sync.Mutex
  37. gpmWg sync.WaitGroup
  38. gpmCleanupPeriod = 60 * time.Second
  39. gpmChan = make(chan chan struct{})
  40. prefix = defaultPrefix
  41. )
  42. // The networkNamespace type is the linux implementation of the Sandbox
  43. // interface. It represents a linux network namespace, and moves an interface
  44. // into it when called on method AddInterface or sets the gateway etc.
  45. type networkNamespace struct {
  46. path string
  47. iFaces []*nwIface
  48. gw net.IP
  49. gwv6 net.IP
  50. staticRoutes []*types.StaticRoute
  51. neighbors []*neigh
  52. nextIfIndex map[string]int
  53. isDefault bool
  54. nlHandle *netlink.Handle
  55. loV6Enabled bool
  56. sync.Mutex
  57. }
  58. // SetBasePath sets the base url prefix for the ns path
  59. func SetBasePath(path string) {
  60. prefix = path
  61. }
  62. func basePath() string {
  63. return filepath.Join(prefix, "netns")
  64. }
  65. func createBasePath() {
  66. err := os.MkdirAll(basePath(), 0755)
  67. if err != nil {
  68. panic("Could not create net namespace path directory")
  69. }
  70. // Start the garbage collection go routine
  71. go removeUnusedPaths()
  72. }
  73. func removeUnusedPaths() {
  74. gpmLock.Lock()
  75. period := gpmCleanupPeriod
  76. gpmLock.Unlock()
  77. ticker := time.NewTicker(period)
  78. for {
  79. var (
  80. gc chan struct{}
  81. gcOk bool
  82. )
  83. select {
  84. case <-ticker.C:
  85. case gc, gcOk = <-gpmChan:
  86. }
  87. gpmLock.Lock()
  88. pathList := make([]string, 0, len(garbagePathMap))
  89. for path := range garbagePathMap {
  90. pathList = append(pathList, path)
  91. }
  92. garbagePathMap = make(map[string]bool)
  93. gpmWg.Add(1)
  94. gpmLock.Unlock()
  95. for _, path := range pathList {
  96. os.Remove(path)
  97. }
  98. gpmWg.Done()
  99. if gcOk {
  100. close(gc)
  101. }
  102. }
  103. }
  104. func addToGarbagePaths(path string) {
  105. gpmLock.Lock()
  106. garbagePathMap[path] = true
  107. gpmLock.Unlock()
  108. }
  109. func removeFromGarbagePaths(path string) {
  110. gpmLock.Lock()
  111. delete(garbagePathMap, path)
  112. gpmLock.Unlock()
  113. }
  114. // GC triggers garbage collection of namespace path right away
  115. // and waits for it.
  116. func GC() {
  117. gpmLock.Lock()
  118. if len(garbagePathMap) == 0 {
  119. // No need for GC if map is empty
  120. gpmLock.Unlock()
  121. return
  122. }
  123. gpmLock.Unlock()
  124. // if content exists in the garbage paths
  125. // we can trigger GC to run, providing a
  126. // channel to be notified on completion
  127. waitGC := make(chan struct{})
  128. gpmChan <- waitGC
  129. // wait for GC completion
  130. <-waitGC
  131. }
  132. // GenerateKey generates a sandbox key based on the passed
  133. // container id.
  134. func GenerateKey(containerID string) string {
  135. maxLen := 12
  136. // Read sandbox key from host for overlay
  137. if strings.HasPrefix(containerID, "-") {
  138. var (
  139. index int
  140. indexStr string
  141. tmpkey string
  142. )
  143. dir, err := os.ReadDir(basePath())
  144. if err != nil {
  145. return ""
  146. }
  147. for _, v := range dir {
  148. id := v.Name()
  149. if strings.HasSuffix(id, containerID[:maxLen-1]) {
  150. indexStr = strings.TrimSuffix(id, containerID[:maxLen-1])
  151. tmpindex, err := strconv.Atoi(indexStr)
  152. if err != nil {
  153. return ""
  154. }
  155. if tmpindex > index {
  156. index = tmpindex
  157. tmpkey = id
  158. }
  159. }
  160. }
  161. containerID = tmpkey
  162. if containerID == "" {
  163. return ""
  164. }
  165. }
  166. if len(containerID) < maxLen {
  167. maxLen = len(containerID)
  168. }
  169. return basePath() + "/" + containerID[:maxLen]
  170. }
  171. // NewSandbox provides a new sandbox instance created in an os specific way
  172. // provided a key which uniquely identifies the sandbox
  173. func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) {
  174. if !isRestore {
  175. err := createNetworkNamespace(key, osCreate)
  176. if err != nil {
  177. return nil, err
  178. }
  179. } else {
  180. once.Do(createBasePath)
  181. }
  182. n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)}
  183. sboxNs, err := netns.GetFromPath(n.path)
  184. if err != nil {
  185. return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
  186. }
  187. defer sboxNs.Close()
  188. n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
  189. if err != nil {
  190. return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
  191. }
  192. err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
  193. if err != nil {
  194. logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
  195. }
  196. // In live-restore mode, IPV6 entries are getting cleaned up due to below code
  197. // We should retain IPV6 configurations in live-restore mode when Docker Daemon
  198. // comes back. It should work as it is on other cases
  199. // As starting point, disable IPv6 on all interfaces
  200. if !isRestore && !n.isDefault {
  201. err = setIPv6(n.path, "all", false)
  202. if err != nil {
  203. logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
  204. }
  205. }
  206. if err = n.loopbackUp(); err != nil {
  207. n.nlHandle.Close()
  208. return nil, err
  209. }
  210. return n, nil
  211. }
  212. func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter {
  213. return n
  214. }
  215. func (n *networkNamespace) NeighborOptions() NeighborOptionSetter {
  216. return n
  217. }
  // mountNetworkNamespace bind-mounts basePath onto lnPath.
  func mountNetworkNamespace(basePath string, lnPath string) error {
  	const (
  		fsType = "bind"
  		flags  = syscall.MS_BIND
  	)
  	return syscall.Mount(basePath, lnPath, fsType, flags, "")
  }
  221. // GetSandboxForExternalKey returns sandbox object for the supplied path
  222. func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) {
  223. if err := createNamespaceFile(key); err != nil {
  224. return nil, err
  225. }
  226. if err := mountNetworkNamespace(basePath, key); err != nil {
  227. return nil, err
  228. }
  229. n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)}
  230. sboxNs, err := netns.GetFromPath(n.path)
  231. if err != nil {
  232. return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
  233. }
  234. defer sboxNs.Close()
  235. n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
  236. if err != nil {
  237. return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
  238. }
  239. err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
  240. if err != nil {
  241. logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
  242. }
  243. // As starting point, disable IPv6 on all interfaces
  244. err = setIPv6(n.path, "all", false)
  245. if err != nil {
  246. logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
  247. }
  248. if err = n.loopbackUp(); err != nil {
  249. n.nlHandle.Close()
  250. return nil, err
  251. }
  252. return n, nil
  253. }
  254. func createNetworkNamespace(path string, osCreate bool) error {
  255. if err := createNamespaceFile(path); err != nil {
  256. return err
  257. }
  258. do := func() error {
  259. return mountNetworkNamespace(fmt.Sprintf("/proc/self/task/%d/ns/net", unix.Gettid()), path)
  260. }
  261. if osCreate {
  262. return unshare.Go(unix.CLONE_NEWNET, do, nil)
  263. }
  264. return do()
  265. }
  266. func unmountNamespaceFile(path string) {
  267. if _, err := os.Stat(path); err == nil {
  268. if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) {
  269. logrus.WithError(err).Error("Error unmounting namespace file")
  270. }
  271. }
  272. }
  273. func createNamespaceFile(path string) (err error) {
  274. var f *os.File
  275. once.Do(createBasePath)
  276. // Remove it from garbage collection list if present
  277. removeFromGarbagePaths(path)
  278. // If the path is there unmount it first
  279. unmountNamespaceFile(path)
  280. // wait for garbage collection to complete if it is in progress
  281. // before trying to create the file.
  282. gpmWg.Wait()
  283. if f, err = os.Create(path); err == nil {
  284. f.Close()
  285. }
  286. return err
  287. }
  288. func (n *networkNamespace) loopbackUp() error {
  289. iface, err := n.nlHandle.LinkByName("lo")
  290. if err != nil {
  291. return err
  292. }
  293. return n.nlHandle.LinkSetUp(iface)
  294. }
  295. func (n *networkNamespace) GetLoopbackIfaceName() string {
  296. return "lo"
  297. }
  298. func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error {
  299. iface, err := n.nlHandle.LinkByName(ifName)
  300. if err != nil {
  301. return err
  302. }
  303. return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip})
  304. }
  305. func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error {
  306. iface, err := n.nlHandle.LinkByName(ifName)
  307. if err != nil {
  308. return err
  309. }
  310. return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip})
  311. }
  312. func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) {
  313. dstName := ""
  314. for _, i := range n.Interfaces() {
  315. if i.SrcName() == srcName {
  316. dstName = i.DstName()
  317. break
  318. }
  319. }
  320. if dstName == "" {
  321. return fmt.Errorf("failed to find interface %s in sandbox", srcName)
  322. }
  323. err := n.InvokeFunc(func() {
  324. path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore")
  325. if err := os.WriteFile(path, []byte{'1', '\n'}, 0644); err != nil {
  326. Err = fmt.Errorf("Failed to set %s to 1: %v", path, err)
  327. return
  328. }
  329. path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce")
  330. if err := os.WriteFile(path, []byte{'2', '\n'}, 0644); err != nil {
  331. Err = fmt.Errorf("Failed to set %s to 2: %v", path, err)
  332. return
  333. }
  334. })
  335. if err != nil {
  336. return err
  337. }
  338. return
  339. }
  340. func (n *networkNamespace) InvokeFunc(f func()) error {
  341. path := n.nsPath()
  342. newNS, err := netns.GetFromPath(path)
  343. if err != nil {
  344. return fmt.Errorf("failed get network namespace %q: %w", path, err)
  345. }
  346. defer newNS.Close()
  347. done := make(chan error, 1)
  348. go func() {
  349. runtime.LockOSThread()
  350. // InvokeFunc() could have been called from a goroutine with
  351. // tampered thread state, e.g. from another InvokeFunc()
  352. // callback. The outer goroutine's thread state cannot be
  353. // trusted.
  354. origNS, err := netns.Get()
  355. if err != nil {
  356. runtime.UnlockOSThread()
  357. done <- fmt.Errorf("failed to get original network namespace: %w", err)
  358. return
  359. }
  360. defer origNS.Close()
  361. if err := netns.Set(newNS); err != nil {
  362. runtime.UnlockOSThread()
  363. done <- err
  364. return
  365. }
  366. defer func() {
  367. close(done)
  368. if err := netns.Set(origNS); err != nil {
  369. logrus.WithError(err).Warn("failed to restore thread's network namespace")
  370. // Recover from the error by leaving this goroutine locked to
  371. // the thread. The runtime will terminate the thread and replace
  372. // it with a clean one when this goroutine returns.
  373. } else {
  374. runtime.UnlockOSThread()
  375. }
  376. }()
  377. f()
  378. }()
  379. return <-done
  380. }
  381. func (n *networkNamespace) nsPath() string {
  382. n.Lock()
  383. defer n.Unlock()
  384. return n.path
  385. }
  386. func (n *networkNamespace) Info() Info {
  387. return n
  388. }
  389. func (n *networkNamespace) Key() string {
  390. return n.path
  391. }
  392. func (n *networkNamespace) Destroy() error {
  393. if n.nlHandle != nil {
  394. n.nlHandle.Close()
  395. }
  396. // Assuming no running process is executing in this network namespace,
  397. // unmounting is sufficient to destroy it.
  398. if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil {
  399. return err
  400. }
  401. // Stash it into the garbage collection list
  402. addToGarbagePaths(n.path)
  403. return nil
  404. }
  405. // Restore restore the network namespace
  406. func (n *networkNamespace) Restore(ifsopt map[string][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error {
  407. // restore interfaces
  408. for name, opts := range ifsopt {
  409. if !strings.Contains(name, "+") {
  410. return fmt.Errorf("wrong iface name in restore osl sandbox interface: %s", name)
  411. }
  412. seps := strings.Split(name, "+")
  413. srcName := seps[0]
  414. dstPrefix := seps[1]
  415. i := &nwIface{srcName: srcName, dstName: dstPrefix, ns: n}
  416. i.processInterfaceOptions(opts...)
  417. if i.master != "" {
  418. i.dstMaster = n.findDst(i.master, true)
  419. if i.dstMaster == "" {
  420. return fmt.Errorf("could not find an appropriate master %q for %q",
  421. i.master, i.srcName)
  422. }
  423. }
  424. if n.isDefault {
  425. i.dstName = i.srcName
  426. } else {
  427. links, err := n.nlHandle.LinkList()
  428. if err != nil {
  429. return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path)
  430. }
  431. // due to the docker network connect/disconnect, so the dstName should
  432. // restore from the namespace
  433. for _, link := range links {
  434. addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4)
  435. if err != nil {
  436. return err
  437. }
  438. ifaceName := link.Attrs().Name
  439. if strings.HasPrefix(ifaceName, "vxlan") {
  440. if i.dstName == "vxlan" {
  441. i.dstName = ifaceName
  442. break
  443. }
  444. }
  445. // find the interface name by ip
  446. if i.address != nil {
  447. for _, addr := range addrs {
  448. if addr.IPNet.String() == i.address.String() {
  449. i.dstName = ifaceName
  450. break
  451. }
  452. continue
  453. }
  454. if i.dstName == ifaceName {
  455. break
  456. }
  457. }
  458. // This is to find the interface name of the pair in overlay sandbox
  459. if strings.HasPrefix(ifaceName, "veth") {
  460. if i.master != "" && i.dstName == "veth" {
  461. i.dstName = ifaceName
  462. }
  463. }
  464. }
  465. var index int
  466. indexStr := strings.TrimPrefix(i.dstName, dstPrefix)
  467. if indexStr != "" {
  468. index, err = strconv.Atoi(indexStr)
  469. if err != nil {
  470. return err
  471. }
  472. }
  473. index++
  474. n.Lock()
  475. if index > n.nextIfIndex[dstPrefix] {
  476. n.nextIfIndex[dstPrefix] = index
  477. }
  478. n.iFaces = append(n.iFaces, i)
  479. n.Unlock()
  480. }
  481. }
  482. // restore routes
  483. for _, r := range routes {
  484. n.Lock()
  485. n.staticRoutes = append(n.staticRoutes, r)
  486. n.Unlock()
  487. }
  488. // restore gateway
  489. if len(gw) > 0 {
  490. n.Lock()
  491. n.gw = gw
  492. n.Unlock()
  493. }
  494. if len(gw6) > 0 {
  495. n.Lock()
  496. n.gwv6 = gw6
  497. n.Unlock()
  498. }
  499. return nil
  500. }
  501. // Checks whether IPv6 needs to be enabled/disabled on the loopback interface
  502. func (n *networkNamespace) checkLoV6() {
  503. var (
  504. enable = false
  505. action = "disable"
  506. )
  507. n.Lock()
  508. for _, iface := range n.iFaces {
  509. if iface.AddressIPv6() != nil {
  510. enable = true
  511. action = "enable"
  512. break
  513. }
  514. }
  515. n.Unlock()
  516. if n.loV6Enabled == enable {
  517. return
  518. }
  519. if err := setIPv6(n.path, "lo", enable); err != nil {
  520. logrus.Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err)
  521. }
  522. n.loV6Enabled = enable
  523. }
  524. func setIPv6(nspath, iface string, enable bool) error {
  525. errCh := make(chan error, 1)
  526. go func() {
  527. defer close(errCh)
  528. namespace, err := netns.GetFromPath(nspath)
  529. if err != nil {
  530. errCh <- fmt.Errorf("failed get network namespace %q: %w", nspath, err)
  531. return
  532. }
  533. defer namespace.Close()
  534. runtime.LockOSThread()
  535. origNS, err := netns.Get()
  536. if err != nil {
  537. runtime.UnlockOSThread()
  538. errCh <- fmt.Errorf("failed to get current network namespace: %w", err)
  539. return
  540. }
  541. defer origNS.Close()
  542. if err = netns.Set(namespace); err != nil {
  543. runtime.UnlockOSThread()
  544. errCh <- fmt.Errorf("setting into container netns %q failed: %w", nspath, err)
  545. return
  546. }
  547. defer func() {
  548. if err := netns.Set(origNS); err != nil {
  549. logrus.WithError(err).Error("libnetwork: restoring thread network namespace failed")
  550. // The error is only fatal for the current thread. Keep this
  551. // goroutine locked to the thread to make the runtime replace it
  552. // with a clean thread once this goroutine returns.
  553. } else {
  554. runtime.UnlockOSThread()
  555. }
  556. }()
  557. var (
  558. action = "disable"
  559. value = byte('1')
  560. path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", iface)
  561. )
  562. if enable {
  563. action = "enable"
  564. value = '0'
  565. }
  566. if _, err := os.Stat(path); err != nil {
  567. if os.IsNotExist(err) {
  568. logrus.WithError(err).Warn("Cannot configure IPv6 forwarding on container interface. Has IPv6 been disabled in this node's kernel?")
  569. return
  570. }
  571. errCh <- err
  572. return
  573. }
  574. if err = os.WriteFile(path, []byte{value, '\n'}, 0o644); err != nil {
  575. errCh <- fmt.Errorf("failed to %s IPv6 forwarding for container's interface %s: %w", action, iface, err)
  576. return
  577. }
  578. }()
  579. return <-errCh
  580. }
  581. // ApplyOSTweaks applies linux configs on the sandbox
  582. func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) {
  583. for _, t := range types {
  584. switch t {
  585. case SandboxTypeLoadBalancer, SandboxTypeIngress:
  586. kernel.ApplyOSTweaks(map[string]*kernel.OSValue{
  587. // disables any special handling on port reuse of existing IPVS connection table entries
  588. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32
  589. "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
  590. // expires connection from the IPVS connection table when the backend is not available
  591. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133
  592. "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
  593. // expires persistent connections to destination servers with weights set to 0
  594. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151
  595. "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
  596. })
  597. }
  598. }
  599. }