namespace_linux.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705
  1. package osl
  2. import (
  3. "errors"
  4. "fmt"
  5. "net"
  6. "os"
  7. "os/exec"
  8. "path/filepath"
  9. "runtime"
  10. "strconv"
  11. "strings"
  12. "sync"
  13. "syscall"
  14. "time"
  15. "github.com/docker/docker/libnetwork/ns"
  16. "github.com/docker/docker/libnetwork/osl/kernel"
  17. "github.com/docker/docker/libnetwork/types"
  18. "github.com/docker/docker/pkg/reexec"
  19. "github.com/sirupsen/logrus"
  20. "github.com/vishvananda/netlink"
  21. "github.com/vishvananda/netns"
  22. "golang.org/x/sys/unix"
  23. )
  24. const defaultPrefix = "/var/run/docker"
  25. func init() {
  26. reexec.Register("set-ipv6", reexecSetIPv6)
  27. // Lock main() to the initial thread to exclude the goroutines spawned
  28. // by func (*networkNamespace) InvokeFunc() from being scheduled onto
  29. // that thread. Changes to the network namespace of the initial thread
  30. // alter /proc/self/ns/net, which would break any code which
  31. // (incorrectly) assumes that that file is a handle to the network
  32. // namespace for the thread it is currently executing on.
  33. runtime.LockOSThread()
  34. }
  35. var (
  36. once sync.Once
  37. garbagePathMap = make(map[string]bool)
  38. gpmLock sync.Mutex
  39. gpmWg sync.WaitGroup
  40. gpmCleanupPeriod = 60 * time.Second
  41. gpmChan = make(chan chan struct{})
  42. prefix = defaultPrefix
  43. )
  44. // The networkNamespace type is the linux implementation of the Sandbox
  45. // interface. It represents a linux network namespace, and moves an interface
  46. // into it when called on method AddInterface or sets the gateway etc.
  47. type networkNamespace struct {
  48. path string
  49. iFaces []*nwIface
  50. gw net.IP
  51. gwv6 net.IP
  52. staticRoutes []*types.StaticRoute
  53. neighbors []*neigh
  54. nextIfIndex map[string]int
  55. isDefault bool
  56. nlHandle *netlink.Handle
  57. loV6Enabled bool
  58. sync.Mutex
  59. }
  60. // SetBasePath sets the base url prefix for the ns path
  61. func SetBasePath(path string) {
  62. prefix = path
  63. }
  64. func init() {
  65. reexec.Register("netns-create", reexecCreateNamespace)
  66. }
  67. func basePath() string {
  68. return filepath.Join(prefix, "netns")
  69. }
  70. func createBasePath() {
  71. err := os.MkdirAll(basePath(), 0755)
  72. if err != nil {
  73. panic("Could not create net namespace path directory")
  74. }
  75. // Start the garbage collection go routine
  76. go removeUnusedPaths()
  77. }
  78. func removeUnusedPaths() {
  79. gpmLock.Lock()
  80. period := gpmCleanupPeriod
  81. gpmLock.Unlock()
  82. ticker := time.NewTicker(period)
  83. for {
  84. var (
  85. gc chan struct{}
  86. gcOk bool
  87. )
  88. select {
  89. case <-ticker.C:
  90. case gc, gcOk = <-gpmChan:
  91. }
  92. gpmLock.Lock()
  93. pathList := make([]string, 0, len(garbagePathMap))
  94. for path := range garbagePathMap {
  95. pathList = append(pathList, path)
  96. }
  97. garbagePathMap = make(map[string]bool)
  98. gpmWg.Add(1)
  99. gpmLock.Unlock()
  100. for _, path := range pathList {
  101. os.Remove(path)
  102. }
  103. gpmWg.Done()
  104. if gcOk {
  105. close(gc)
  106. }
  107. }
  108. }
  109. func addToGarbagePaths(path string) {
  110. gpmLock.Lock()
  111. garbagePathMap[path] = true
  112. gpmLock.Unlock()
  113. }
  114. func removeFromGarbagePaths(path string) {
  115. gpmLock.Lock()
  116. delete(garbagePathMap, path)
  117. gpmLock.Unlock()
  118. }
  119. // GC triggers garbage collection of namespace path right away
  120. // and waits for it.
  121. func GC() {
  122. gpmLock.Lock()
  123. if len(garbagePathMap) == 0 {
  124. // No need for GC if map is empty
  125. gpmLock.Unlock()
  126. return
  127. }
  128. gpmLock.Unlock()
  129. // if content exists in the garbage paths
  130. // we can trigger GC to run, providing a
  131. // channel to be notified on completion
  132. waitGC := make(chan struct{})
  133. gpmChan <- waitGC
  134. // wait for GC completion
  135. <-waitGC
  136. }
  137. // GenerateKey generates a sandbox key based on the passed
  138. // container id.
  139. func GenerateKey(containerID string) string {
  140. maxLen := 12
  141. // Read sandbox key from host for overlay
  142. if strings.HasPrefix(containerID, "-") {
  143. var (
  144. index int
  145. indexStr string
  146. tmpkey string
  147. )
  148. dir, err := os.ReadDir(basePath())
  149. if err != nil {
  150. return ""
  151. }
  152. for _, v := range dir {
  153. id := v.Name()
  154. if strings.HasSuffix(id, containerID[:maxLen-1]) {
  155. indexStr = strings.TrimSuffix(id, containerID[:maxLen-1])
  156. tmpindex, err := strconv.Atoi(indexStr)
  157. if err != nil {
  158. return ""
  159. }
  160. if tmpindex > index {
  161. index = tmpindex
  162. tmpkey = id
  163. }
  164. }
  165. }
  166. containerID = tmpkey
  167. if containerID == "" {
  168. return ""
  169. }
  170. }
  171. if len(containerID) < maxLen {
  172. maxLen = len(containerID)
  173. }
  174. return basePath() + "/" + containerID[:maxLen]
  175. }
  176. // NewSandbox provides a new sandbox instance created in an os specific way
  177. // provided a key which uniquely identifies the sandbox
  178. func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) {
  179. if !isRestore {
  180. err := createNetworkNamespace(key, osCreate)
  181. if err != nil {
  182. return nil, err
  183. }
  184. } else {
  185. once.Do(createBasePath)
  186. }
  187. n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)}
  188. sboxNs, err := netns.GetFromPath(n.path)
  189. if err != nil {
  190. return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
  191. }
  192. defer sboxNs.Close()
  193. n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
  194. if err != nil {
  195. return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
  196. }
  197. err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
  198. if err != nil {
  199. logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
  200. }
  201. // In live-restore mode, IPV6 entries are getting cleaned up due to below code
  202. // We should retain IPV6 configurations in live-restore mode when Docker Daemon
  203. // comes back. It should work as it is on other cases
  204. // As starting point, disable IPv6 on all interfaces
  205. if !isRestore && !n.isDefault {
  206. err = setIPv6(n.path, "all", false)
  207. if err != nil {
  208. logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
  209. }
  210. }
  211. if err = n.loopbackUp(); err != nil {
  212. n.nlHandle.Close()
  213. return nil, err
  214. }
  215. return n, nil
  216. }
  217. func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter {
  218. return n
  219. }
  220. func (n *networkNamespace) NeighborOptions() NeighborOptionSetter {
  221. return n
  222. }
  223. func mountNetworkNamespace(basePath string, lnPath string) error {
  224. return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "")
  225. }
  226. // GetSandboxForExternalKey returns sandbox object for the supplied path
  227. func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) {
  228. if err := createNamespaceFile(key); err != nil {
  229. return nil, err
  230. }
  231. if err := mountNetworkNamespace(basePath, key); err != nil {
  232. return nil, err
  233. }
  234. n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)}
  235. sboxNs, err := netns.GetFromPath(n.path)
  236. if err != nil {
  237. return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
  238. }
  239. defer sboxNs.Close()
  240. n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
  241. if err != nil {
  242. return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
  243. }
  244. err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
  245. if err != nil {
  246. logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
  247. }
  248. // As starting point, disable IPv6 on all interfaces
  249. err = setIPv6(n.path, "all", false)
  250. if err != nil {
  251. logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
  252. }
  253. if err = n.loopbackUp(); err != nil {
  254. n.nlHandle.Close()
  255. return nil, err
  256. }
  257. return n, nil
  258. }
  259. func reexecCreateNamespace() {
  260. if len(os.Args) < 2 {
  261. logrus.Fatal("no namespace path provided")
  262. }
  263. if err := mountNetworkNamespace("/proc/self/ns/net", os.Args[1]); err != nil {
  264. logrus.Fatal(err)
  265. }
  266. }
  267. func createNetworkNamespace(path string, osCreate bool) error {
  268. if err := createNamespaceFile(path); err != nil {
  269. return err
  270. }
  271. cmd := &exec.Cmd{
  272. Path: reexec.Self(),
  273. Args: append([]string{"netns-create"}, path),
  274. Stdout: os.Stdout,
  275. Stderr: os.Stderr,
  276. }
  277. if osCreate {
  278. cmd.SysProcAttr = &syscall.SysProcAttr{}
  279. cmd.SysProcAttr.Cloneflags = syscall.CLONE_NEWNET
  280. }
  281. if err := cmd.Run(); err != nil {
  282. return fmt.Errorf("namespace creation reexec command failed: %v", err)
  283. }
  284. return nil
  285. }
  286. func unmountNamespaceFile(path string) {
  287. if _, err := os.Stat(path); err == nil {
  288. if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) {
  289. logrus.WithError(err).Error("Error unmounting namespace file")
  290. }
  291. }
  292. }
  293. func createNamespaceFile(path string) (err error) {
  294. var f *os.File
  295. once.Do(createBasePath)
  296. // Remove it from garbage collection list if present
  297. removeFromGarbagePaths(path)
  298. // If the path is there unmount it first
  299. unmountNamespaceFile(path)
  300. // wait for garbage collection to complete if it is in progress
  301. // before trying to create the file.
  302. gpmWg.Wait()
  303. if f, err = os.Create(path); err == nil {
  304. f.Close()
  305. }
  306. return err
  307. }
  308. func (n *networkNamespace) loopbackUp() error {
  309. iface, err := n.nlHandle.LinkByName("lo")
  310. if err != nil {
  311. return err
  312. }
  313. return n.nlHandle.LinkSetUp(iface)
  314. }
  315. func (n *networkNamespace) GetLoopbackIfaceName() string {
  316. return "lo"
  317. }
  318. func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error {
  319. iface, err := n.nlHandle.LinkByName(ifName)
  320. if err != nil {
  321. return err
  322. }
  323. return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip})
  324. }
  325. func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error {
  326. iface, err := n.nlHandle.LinkByName(ifName)
  327. if err != nil {
  328. return err
  329. }
  330. return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip})
  331. }
  332. func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) {
  333. dstName := ""
  334. for _, i := range n.Interfaces() {
  335. if i.SrcName() == srcName {
  336. dstName = i.DstName()
  337. break
  338. }
  339. }
  340. if dstName == "" {
  341. return fmt.Errorf("failed to find interface %s in sandbox", srcName)
  342. }
  343. err := n.InvokeFunc(func() {
  344. path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore")
  345. if err := os.WriteFile(path, []byte{'1', '\n'}, 0644); err != nil {
  346. Err = fmt.Errorf("Failed to set %s to 1: %v", path, err)
  347. return
  348. }
  349. path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce")
  350. if err := os.WriteFile(path, []byte{'2', '\n'}, 0644); err != nil {
  351. Err = fmt.Errorf("Failed to set %s to 2: %v", path, err)
  352. return
  353. }
  354. })
  355. if err != nil {
  356. return err
  357. }
  358. return
  359. }
  360. // InitOSContext initializes OS context while configuring network resources
  361. func InitOSContext() func() {
  362. return func() {}
  363. }
  364. func (n *networkNamespace) InvokeFunc(f func()) error {
  365. origNS, err := netns.Get()
  366. if err != nil {
  367. return fmt.Errorf("failed to get original network namespace: %w", err)
  368. }
  369. defer origNS.Close()
  370. path := n.nsPath()
  371. newNS, err := netns.GetFromPath(path)
  372. if err != nil {
  373. return fmt.Errorf("failed get network namespace %q: %w", path, err)
  374. }
  375. defer newNS.Close()
  376. done := make(chan error, 1)
  377. go func() {
  378. runtime.LockOSThread()
  379. if err := netns.Set(newNS); err != nil {
  380. runtime.UnlockOSThread()
  381. done <- err
  382. return
  383. }
  384. defer func() {
  385. close(done)
  386. if err := netns.Set(origNS); err != nil {
  387. logrus.WithError(err).Warn("failed to restore thread's network namespace")
  388. // Recover from the error by leaving this goroutine locked to
  389. // the thread. The runtime will terminate the thread and replace
  390. // it with a clean one when this goroutine returns.
  391. } else {
  392. runtime.UnlockOSThread()
  393. }
  394. }()
  395. f()
  396. }()
  397. return <-done
  398. }
  399. func (n *networkNamespace) nsPath() string {
  400. n.Lock()
  401. defer n.Unlock()
  402. return n.path
  403. }
  404. func (n *networkNamespace) Info() Info {
  405. return n
  406. }
  407. func (n *networkNamespace) Key() string {
  408. return n.path
  409. }
  410. func (n *networkNamespace) Destroy() error {
  411. if n.nlHandle != nil {
  412. n.nlHandle.Close()
  413. }
  414. // Assuming no running process is executing in this network namespace,
  415. // unmounting is sufficient to destroy it.
  416. if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil {
  417. return err
  418. }
  419. // Stash it into the garbage collection list
  420. addToGarbagePaths(n.path)
  421. return nil
  422. }
  423. // Restore restore the network namespace
  424. func (n *networkNamespace) Restore(ifsopt map[string][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error {
  425. // restore interfaces
  426. for name, opts := range ifsopt {
  427. if !strings.Contains(name, "+") {
  428. return fmt.Errorf("wrong iface name in restore osl sandbox interface: %s", name)
  429. }
  430. seps := strings.Split(name, "+")
  431. srcName := seps[0]
  432. dstPrefix := seps[1]
  433. i := &nwIface{srcName: srcName, dstName: dstPrefix, ns: n}
  434. i.processInterfaceOptions(opts...)
  435. if i.master != "" {
  436. i.dstMaster = n.findDst(i.master, true)
  437. if i.dstMaster == "" {
  438. return fmt.Errorf("could not find an appropriate master %q for %q",
  439. i.master, i.srcName)
  440. }
  441. }
  442. if n.isDefault {
  443. i.dstName = i.srcName
  444. } else {
  445. links, err := n.nlHandle.LinkList()
  446. if err != nil {
  447. return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path)
  448. }
  449. // due to the docker network connect/disconnect, so the dstName should
  450. // restore from the namespace
  451. for _, link := range links {
  452. addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4)
  453. if err != nil {
  454. return err
  455. }
  456. ifaceName := link.Attrs().Name
  457. if strings.HasPrefix(ifaceName, "vxlan") {
  458. if i.dstName == "vxlan" {
  459. i.dstName = ifaceName
  460. break
  461. }
  462. }
  463. // find the interface name by ip
  464. if i.address != nil {
  465. for _, addr := range addrs {
  466. if addr.IPNet.String() == i.address.String() {
  467. i.dstName = ifaceName
  468. break
  469. }
  470. continue
  471. }
  472. if i.dstName == ifaceName {
  473. break
  474. }
  475. }
  476. // This is to find the interface name of the pair in overlay sandbox
  477. if strings.HasPrefix(ifaceName, "veth") {
  478. if i.master != "" && i.dstName == "veth" {
  479. i.dstName = ifaceName
  480. }
  481. }
  482. }
  483. var index int
  484. indexStr := strings.TrimPrefix(i.dstName, dstPrefix)
  485. if indexStr != "" {
  486. index, err = strconv.Atoi(indexStr)
  487. if err != nil {
  488. return err
  489. }
  490. }
  491. index++
  492. n.Lock()
  493. if index > n.nextIfIndex[dstPrefix] {
  494. n.nextIfIndex[dstPrefix] = index
  495. }
  496. n.iFaces = append(n.iFaces, i)
  497. n.Unlock()
  498. }
  499. }
  500. // restore routes
  501. for _, r := range routes {
  502. n.Lock()
  503. n.staticRoutes = append(n.staticRoutes, r)
  504. n.Unlock()
  505. }
  506. // restore gateway
  507. if len(gw) > 0 {
  508. n.Lock()
  509. n.gw = gw
  510. n.Unlock()
  511. }
  512. if len(gw6) > 0 {
  513. n.Lock()
  514. n.gwv6 = gw6
  515. n.Unlock()
  516. }
  517. return nil
  518. }
  519. // Checks whether IPv6 needs to be enabled/disabled on the loopback interface
  520. func (n *networkNamespace) checkLoV6() {
  521. var (
  522. enable = false
  523. action = "disable"
  524. )
  525. n.Lock()
  526. for _, iface := range n.iFaces {
  527. if iface.AddressIPv6() != nil {
  528. enable = true
  529. action = "enable"
  530. break
  531. }
  532. }
  533. n.Unlock()
  534. if n.loV6Enabled == enable {
  535. return
  536. }
  537. if err := setIPv6(n.path, "lo", enable); err != nil {
  538. logrus.Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err)
  539. }
  540. n.loV6Enabled = enable
  541. }
  542. func reexecSetIPv6() {
  543. runtime.LockOSThread()
  544. defer runtime.UnlockOSThread()
  545. if len(os.Args) < 3 {
  546. logrus.Errorf("invalid number of arguments for %s", os.Args[0])
  547. os.Exit(1)
  548. }
  549. ns, err := netns.GetFromPath(os.Args[1])
  550. if err != nil {
  551. logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err)
  552. os.Exit(2)
  553. }
  554. defer ns.Close()
  555. if err = netns.Set(ns); err != nil {
  556. logrus.Errorf("setting into container netns %q failed: %v", os.Args[1], err)
  557. os.Exit(3)
  558. }
  559. var (
  560. action = "disable"
  561. value = byte('1')
  562. path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", os.Args[2])
  563. )
  564. if os.Args[3] == "true" {
  565. action = "enable"
  566. value = byte('0')
  567. }
  568. if _, err := os.Stat(path); err != nil {
  569. if os.IsNotExist(err) {
  570. logrus.Warnf("file does not exist: %s : %v Has IPv6 been disabled in this node's kernel?", path, err)
  571. os.Exit(0)
  572. }
  573. logrus.Errorf("failed to stat %s : %v", path, err)
  574. os.Exit(5)
  575. }
  576. if err = os.WriteFile(path, []byte{value, '\n'}, 0644); err != nil {
  577. logrus.Errorf("failed to %s IPv6 forwarding for container's interface %s: %v", action, os.Args[2], err)
  578. os.Exit(4)
  579. }
  580. os.Exit(0)
  581. }
  582. func setIPv6(path, iface string, enable bool) error {
  583. cmd := &exec.Cmd{
  584. Path: reexec.Self(),
  585. Args: append([]string{"set-ipv6"}, path, iface, strconv.FormatBool(enable)),
  586. Stdout: os.Stdout,
  587. Stderr: os.Stderr,
  588. }
  589. if err := cmd.Run(); err != nil {
  590. return fmt.Errorf("reexec to set IPv6 failed: %v", err)
  591. }
  592. return nil
  593. }
  594. // ApplyOSTweaks applies linux configs on the sandbox
  595. func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) {
  596. for _, t := range types {
  597. switch t {
  598. case SandboxTypeLoadBalancer, SandboxTypeIngress:
  599. kernel.ApplyOSTweaks(map[string]*kernel.OSValue{
  600. // disables any special handling on port reuse of existing IPVS connection table entries
  601. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L32
  602. "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
  603. // expires connection from the IPVS connection table when the backend is not available
  604. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L133
  605. "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
  606. // expires persistent connections to destination servers with weights set to 0
  607. // more info: https://github.com/torvalds/linux/blame/v5.15/Documentation/networking/ipvs-sysctl.rst#L151
  608. "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
  609. })
  610. }
  611. }
  612. }