namespace_linux.go 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696
  1. package osl
  2. import (
  3. "errors"
  4. "fmt"
  5. "net"
  6. "os"
  7. "os/exec"
  8. "path/filepath"
  9. "runtime"
  10. "strconv"
  11. "strings"
  12. "sync"
  13. "syscall"
  14. "time"
  15. "github.com/docker/docker/libnetwork/ns"
  16. "github.com/docker/docker/libnetwork/osl/kernel"
  17. "github.com/docker/docker/libnetwork/types"
  18. "github.com/docker/docker/pkg/reexec"
  19. "github.com/sirupsen/logrus"
  20. "github.com/vishvananda/netlink"
  21. "github.com/vishvananda/netns"
  22. "golang.org/x/sys/unix"
  23. )
  // defaultPrefix is the initial value of the package-level prefix under which
  // the "netns" directory is placed.
  const (
  	defaultPrefix = "/var/run/docker"
  )
  25. func init() {
  26. reexec.Register("set-ipv6", reexecSetIPv6)
  27. }
  28. var (
  29. once sync.Once
  30. garbagePathMap = make(map[string]bool)
  31. gpmLock sync.Mutex
  32. gpmWg sync.WaitGroup
  33. gpmCleanupPeriod = 60 * time.Second
  34. gpmChan = make(chan chan struct{})
  35. prefix = defaultPrefix
  36. loadBalancerConfig = map[string]*kernel.OSValue{
  37. // disables any special handling on port reuse of existing IPVS connection table entries
  38. // more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L25:1
  39. "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
  40. // expires connection from the IPVS connection table when the backend is not available
  41. // more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L126:1
  42. "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
  43. // expires persistent connections to destination servers with weights set to 0
  44. // more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L144:1
  45. "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
  46. }
  47. )
  48. // The networkNamespace type is the linux implementation of the Sandbox
  49. // interface. It represents a linux network namespace, and moves an interface
  50. // into it when called on method AddInterface or sets the gateway etc.
  51. type networkNamespace struct {
  52. path string
  53. iFaces []*nwIface
  54. gw net.IP
  55. gwv6 net.IP
  56. staticRoutes []*types.StaticRoute
  57. neighbors []*neigh
  58. nextIfIndex map[string]int
  59. isDefault bool
  60. nlHandle *netlink.Handle
  61. loV6Enabled bool
  62. sync.Mutex
  63. }
  64. // SetBasePath sets the base url prefix for the ns path
  65. func SetBasePath(path string) {
  66. prefix = path
  67. }
  68. func init() {
  69. reexec.Register("netns-create", reexecCreateNamespace)
  70. }
  71. func basePath() string {
  72. return filepath.Join(prefix, "netns")
  73. }
  74. func createBasePath() {
  75. err := os.MkdirAll(basePath(), 0755)
  76. if err != nil {
  77. panic("Could not create net namespace path directory")
  78. }
  79. // Start the garbage collection go routine
  80. go removeUnusedPaths()
  81. }
  82. func removeUnusedPaths() {
  83. gpmLock.Lock()
  84. period := gpmCleanupPeriod
  85. gpmLock.Unlock()
  86. ticker := time.NewTicker(period)
  87. for {
  88. var (
  89. gc chan struct{}
  90. gcOk bool
  91. )
  92. select {
  93. case <-ticker.C:
  94. case gc, gcOk = <-gpmChan:
  95. }
  96. gpmLock.Lock()
  97. pathList := make([]string, 0, len(garbagePathMap))
  98. for path := range garbagePathMap {
  99. pathList = append(pathList, path)
  100. }
  101. garbagePathMap = make(map[string]bool)
  102. gpmWg.Add(1)
  103. gpmLock.Unlock()
  104. for _, path := range pathList {
  105. os.Remove(path)
  106. }
  107. gpmWg.Done()
  108. if gcOk {
  109. close(gc)
  110. }
  111. }
  112. }
  113. func addToGarbagePaths(path string) {
  114. gpmLock.Lock()
  115. garbagePathMap[path] = true
  116. gpmLock.Unlock()
  117. }
  118. func removeFromGarbagePaths(path string) {
  119. gpmLock.Lock()
  120. delete(garbagePathMap, path)
  121. gpmLock.Unlock()
  122. }
  123. // GC triggers garbage collection of namespace path right away
  124. // and waits for it.
  125. func GC() {
  126. gpmLock.Lock()
  127. if len(garbagePathMap) == 0 {
  128. // No need for GC if map is empty
  129. gpmLock.Unlock()
  130. return
  131. }
  132. gpmLock.Unlock()
  133. // if content exists in the garbage paths
  134. // we can trigger GC to run, providing a
  135. // channel to be notified on completion
  136. waitGC := make(chan struct{})
  137. gpmChan <- waitGC
  138. // wait for GC completion
  139. <-waitGC
  140. }
  141. // GenerateKey generates a sandbox key based on the passed
  142. // container id.
  143. func GenerateKey(containerID string) string {
  144. maxLen := 12
  145. // Read sandbox key from host for overlay
  146. if strings.HasPrefix(containerID, "-") {
  147. var (
  148. index int
  149. indexStr string
  150. tmpkey string
  151. )
  152. dir, err := os.ReadDir(basePath())
  153. if err != nil {
  154. return ""
  155. }
  156. for _, v := range dir {
  157. id := v.Name()
  158. if strings.HasSuffix(id, containerID[:maxLen-1]) {
  159. indexStr = strings.TrimSuffix(id, containerID[:maxLen-1])
  160. tmpindex, err := strconv.Atoi(indexStr)
  161. if err != nil {
  162. return ""
  163. }
  164. if tmpindex > index {
  165. index = tmpindex
  166. tmpkey = id
  167. }
  168. }
  169. }
  170. containerID = tmpkey
  171. if containerID == "" {
  172. return ""
  173. }
  174. }
  175. if len(containerID) < maxLen {
  176. maxLen = len(containerID)
  177. }
  178. return basePath() + "/" + containerID[:maxLen]
  179. }
  180. // NewSandbox provides a new sandbox instance created in an os specific way
  181. // provided a key which uniquely identifies the sandbox
  182. func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) {
  183. if !isRestore {
  184. err := createNetworkNamespace(key, osCreate)
  185. if err != nil {
  186. return nil, err
  187. }
  188. } else {
  189. once.Do(createBasePath)
  190. }
  191. n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)}
  192. sboxNs, err := netns.GetFromPath(n.path)
  193. if err != nil {
  194. return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
  195. }
  196. defer sboxNs.Close()
  197. n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
  198. if err != nil {
  199. return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
  200. }
  201. err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
  202. if err != nil {
  203. logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
  204. }
  205. // In live-restore mode, IPV6 entries are getting cleaned up due to below code
  206. // We should retain IPV6 configurations in live-restore mode when Docker Daemon
  207. // comes back. It should work as it is on other cases
  208. // As starting point, disable IPv6 on all interfaces
  209. if !isRestore && !n.isDefault {
  210. err = setIPv6(n.path, "all", false)
  211. if err != nil {
  212. logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
  213. }
  214. }
  215. if err = n.loopbackUp(); err != nil {
  216. n.nlHandle.Delete()
  217. return nil, err
  218. }
  219. return n, nil
  220. }
  221. func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter {
  222. return n
  223. }
  224. func (n *networkNamespace) NeighborOptions() NeighborOptionSetter {
  225. return n
  226. }
  // mountNetworkNamespace bind-mounts basePath onto lnPath and returns the
  // error reported by the mount call, if any.
  func mountNetworkNamespace(basePath string, lnPath string) error {
  	const (
  		fsType = "bind"
  		noData = ""
  	)
  	return syscall.Mount(basePath, lnPath, fsType, syscall.MS_BIND, noData)
  }
  230. // GetSandboxForExternalKey returns sandbox object for the supplied path
  231. func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) {
  232. if err := createNamespaceFile(key); err != nil {
  233. return nil, err
  234. }
  235. if err := mountNetworkNamespace(basePath, key); err != nil {
  236. return nil, err
  237. }
  238. n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)}
  239. sboxNs, err := netns.GetFromPath(n.path)
  240. if err != nil {
  241. return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
  242. }
  243. defer sboxNs.Close()
  244. n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
  245. if err != nil {
  246. return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
  247. }
  248. err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
  249. if err != nil {
  250. logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
  251. }
  252. // As starting point, disable IPv6 on all interfaces
  253. err = setIPv6(n.path, "all", false)
  254. if err != nil {
  255. logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
  256. }
  257. if err = n.loopbackUp(); err != nil {
  258. n.nlHandle.Delete()
  259. return nil, err
  260. }
  261. return n, nil
  262. }
  263. func reexecCreateNamespace() {
  264. if len(os.Args) < 2 {
  265. logrus.Fatal("no namespace path provided")
  266. }
  267. if err := mountNetworkNamespace("/proc/self/ns/net", os.Args[1]); err != nil {
  268. logrus.Fatal(err)
  269. }
  270. }
  271. func createNetworkNamespace(path string, osCreate bool) error {
  272. if err := createNamespaceFile(path); err != nil {
  273. return err
  274. }
  275. cmd := &exec.Cmd{
  276. Path: reexec.Self(),
  277. Args: append([]string{"netns-create"}, path),
  278. Stdout: os.Stdout,
  279. Stderr: os.Stderr,
  280. }
  281. if osCreate {
  282. cmd.SysProcAttr = &syscall.SysProcAttr{}
  283. cmd.SysProcAttr.Cloneflags = syscall.CLONE_NEWNET
  284. }
  285. if err := cmd.Run(); err != nil {
  286. return fmt.Errorf("namespace creation reexec command failed: %v", err)
  287. }
  288. return nil
  289. }
  290. func unmountNamespaceFile(path string) {
  291. if _, err := os.Stat(path); err == nil {
  292. if err := syscall.Unmount(path, syscall.MNT_DETACH); err != nil && !errors.Is(err, unix.EINVAL) {
  293. logrus.WithError(err).Error("Error unmounting namespace file")
  294. }
  295. }
  296. }
  297. func createNamespaceFile(path string) (err error) {
  298. var f *os.File
  299. once.Do(createBasePath)
  300. // Remove it from garbage collection list if present
  301. removeFromGarbagePaths(path)
  302. // If the path is there unmount it first
  303. unmountNamespaceFile(path)
  304. // wait for garbage collection to complete if it is in progress
  305. // before trying to create the file.
  306. gpmWg.Wait()
  307. if f, err = os.Create(path); err == nil {
  308. f.Close()
  309. }
  310. return err
  311. }
  312. func (n *networkNamespace) loopbackUp() error {
  313. iface, err := n.nlHandle.LinkByName("lo")
  314. if err != nil {
  315. return err
  316. }
  317. return n.nlHandle.LinkSetUp(iface)
  318. }
  319. func (n *networkNamespace) GetLoopbackIfaceName() string {
  320. return "lo"
  321. }
  322. func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error {
  323. iface, err := n.nlHandle.LinkByName(ifName)
  324. if err != nil {
  325. return err
  326. }
  327. return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip})
  328. }
  329. func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error {
  330. iface, err := n.nlHandle.LinkByName(ifName)
  331. if err != nil {
  332. return err
  333. }
  334. return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip})
  335. }
  336. func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) {
  337. dstName := ""
  338. for _, i := range n.Interfaces() {
  339. if i.SrcName() == srcName {
  340. dstName = i.DstName()
  341. break
  342. }
  343. }
  344. if dstName == "" {
  345. return fmt.Errorf("failed to find interface %s in sandbox", srcName)
  346. }
  347. err := n.InvokeFunc(func() {
  348. path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore")
  349. if err := os.WriteFile(path, []byte{'1', '\n'}, 0644); err != nil {
  350. Err = fmt.Errorf("Failed to set %s to 1: %v", path, err)
  351. return
  352. }
  353. path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce")
  354. if err := os.WriteFile(path, []byte{'2', '\n'}, 0644); err != nil {
  355. Err = fmt.Errorf("Failed to set %s to 2: %v", path, err)
  356. return
  357. }
  358. })
  359. if err != nil {
  360. return err
  361. }
  362. return
  363. }
  364. func (n *networkNamespace) InvokeFunc(f func()) error {
  365. return nsInvoke(n.nsPath(), func(nsFD int) error { return nil }, func(callerFD int) error {
  366. f()
  367. return nil
  368. })
  369. }
  370. // InitOSContext initializes OS context while configuring network resources
  371. func InitOSContext() func() {
  372. runtime.LockOSThread()
  373. if err := ns.SetNamespace(); err != nil {
  374. logrus.Error(err)
  375. }
  376. return runtime.UnlockOSThread
  377. }
  378. func nsInvoke(path string, prefunc func(nsFD int) error, postfunc func(callerFD int) error) error {
  379. defer InitOSContext()()
  380. newNs, err := netns.GetFromPath(path)
  381. if err != nil {
  382. return fmt.Errorf("failed get network namespace %q: %v", path, err)
  383. }
  384. defer newNs.Close()
  385. // Invoked before the namespace switch happens but after the namespace file
  386. // handle is obtained.
  387. if err := prefunc(int(newNs)); err != nil {
  388. return fmt.Errorf("failed in prefunc: %v", err)
  389. }
  390. if err = netns.Set(newNs); err != nil {
  391. return err
  392. }
  393. defer ns.SetNamespace()
  394. // Invoked after the namespace switch.
  395. return postfunc(ns.ParseHandlerInt())
  396. }
  397. func (n *networkNamespace) nsPath() string {
  398. n.Lock()
  399. defer n.Unlock()
  400. return n.path
  401. }
  402. func (n *networkNamespace) Info() Info {
  403. return n
  404. }
  405. func (n *networkNamespace) Key() string {
  406. return n.path
  407. }
  408. func (n *networkNamespace) Destroy() error {
  409. if n.nlHandle != nil {
  410. n.nlHandle.Delete()
  411. }
  412. // Assuming no running process is executing in this network namespace,
  413. // unmounting is sufficient to destroy it.
  414. if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil {
  415. return err
  416. }
  417. // Stash it into the garbage collection list
  418. addToGarbagePaths(n.path)
  419. return nil
  420. }
  421. // Restore restore the network namespace
  422. func (n *networkNamespace) Restore(ifsopt map[string][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error {
  423. // restore interfaces
  424. for name, opts := range ifsopt {
  425. if !strings.Contains(name, "+") {
  426. return fmt.Errorf("wrong iface name in restore osl sandbox interface: %s", name)
  427. }
  428. seps := strings.Split(name, "+")
  429. srcName := seps[0]
  430. dstPrefix := seps[1]
  431. i := &nwIface{srcName: srcName, dstName: dstPrefix, ns: n}
  432. i.processInterfaceOptions(opts...)
  433. if i.master != "" {
  434. i.dstMaster = n.findDst(i.master, true)
  435. if i.dstMaster == "" {
  436. return fmt.Errorf("could not find an appropriate master %q for %q",
  437. i.master, i.srcName)
  438. }
  439. }
  440. if n.isDefault {
  441. i.dstName = i.srcName
  442. } else {
  443. links, err := n.nlHandle.LinkList()
  444. if err != nil {
  445. return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path)
  446. }
  447. // due to the docker network connect/disconnect, so the dstName should
  448. // restore from the namespace
  449. for _, link := range links {
  450. addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4)
  451. if err != nil {
  452. return err
  453. }
  454. ifaceName := link.Attrs().Name
  455. if strings.HasPrefix(ifaceName, "vxlan") {
  456. if i.dstName == "vxlan" {
  457. i.dstName = ifaceName
  458. break
  459. }
  460. }
  461. // find the interface name by ip
  462. if i.address != nil {
  463. for _, addr := range addrs {
  464. if addr.IPNet.String() == i.address.String() {
  465. i.dstName = ifaceName
  466. break
  467. }
  468. continue
  469. }
  470. if i.dstName == ifaceName {
  471. break
  472. }
  473. }
  474. // This is to find the interface name of the pair in overlay sandbox
  475. if strings.HasPrefix(ifaceName, "veth") {
  476. if i.master != "" && i.dstName == "veth" {
  477. i.dstName = ifaceName
  478. }
  479. }
  480. }
  481. var index int
  482. indexStr := strings.TrimPrefix(i.dstName, dstPrefix)
  483. if indexStr != "" {
  484. index, err = strconv.Atoi(indexStr)
  485. if err != nil {
  486. return err
  487. }
  488. }
  489. index++
  490. n.Lock()
  491. if index > n.nextIfIndex[dstPrefix] {
  492. n.nextIfIndex[dstPrefix] = index
  493. }
  494. n.iFaces = append(n.iFaces, i)
  495. n.Unlock()
  496. }
  497. }
  498. // restore routes
  499. for _, r := range routes {
  500. n.Lock()
  501. n.staticRoutes = append(n.staticRoutes, r)
  502. n.Unlock()
  503. }
  504. // restore gateway
  505. if len(gw) > 0 {
  506. n.Lock()
  507. n.gw = gw
  508. n.Unlock()
  509. }
  510. if len(gw6) > 0 {
  511. n.Lock()
  512. n.gwv6 = gw6
  513. n.Unlock()
  514. }
  515. return nil
  516. }
  517. // Checks whether IPv6 needs to be enabled/disabled on the loopback interface
  518. func (n *networkNamespace) checkLoV6() {
  519. var (
  520. enable = false
  521. action = "disable"
  522. )
  523. n.Lock()
  524. for _, iface := range n.iFaces {
  525. if iface.AddressIPv6() != nil {
  526. enable = true
  527. action = "enable"
  528. break
  529. }
  530. }
  531. n.Unlock()
  532. if n.loV6Enabled == enable {
  533. return
  534. }
  535. if err := setIPv6(n.path, "lo", enable); err != nil {
  536. logrus.Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err)
  537. }
  538. n.loV6Enabled = enable
  539. }
  540. func reexecSetIPv6() {
  541. runtime.LockOSThread()
  542. defer runtime.UnlockOSThread()
  543. if len(os.Args) < 3 {
  544. logrus.Errorf("invalid number of arguments for %s", os.Args[0])
  545. os.Exit(1)
  546. }
  547. ns, err := netns.GetFromPath(os.Args[1])
  548. if err != nil {
  549. logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err)
  550. os.Exit(2)
  551. }
  552. defer ns.Close()
  553. if err = netns.Set(ns); err != nil {
  554. logrus.Errorf("setting into container netns %q failed: %v", os.Args[1], err)
  555. os.Exit(3)
  556. }
  557. var (
  558. action = "disable"
  559. value = byte('1')
  560. path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", os.Args[2])
  561. )
  562. if os.Args[3] == "true" {
  563. action = "enable"
  564. value = byte('0')
  565. }
  566. if _, err := os.Stat(path); err != nil {
  567. if os.IsNotExist(err) {
  568. logrus.Warnf("file does not exist: %s : %v Has IPv6 been disabled in this node's kernel?", path, err)
  569. os.Exit(0)
  570. }
  571. logrus.Errorf("failed to stat %s : %v", path, err)
  572. os.Exit(5)
  573. }
  574. if err = os.WriteFile(path, []byte{value, '\n'}, 0644); err != nil {
  575. logrus.Errorf("failed to %s IPv6 forwarding for container's interface %s: %v", action, os.Args[2], err)
  576. os.Exit(4)
  577. }
  578. os.Exit(0)
  579. }
  580. func setIPv6(path, iface string, enable bool) error {
  581. cmd := &exec.Cmd{
  582. Path: reexec.Self(),
  583. Args: append([]string{"set-ipv6"}, path, iface, strconv.FormatBool(enable)),
  584. Stdout: os.Stdout,
  585. Stderr: os.Stderr,
  586. }
  587. if err := cmd.Run(); err != nil {
  588. return fmt.Errorf("reexec to set IPv6 failed: %v", err)
  589. }
  590. return nil
  591. }
  592. // ApplyOSTweaks applies linux configs on the sandbox
  593. func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) {
  594. for _, t := range types {
  595. switch t {
  596. case SandboxTypeLoadBalancer:
  597. kernel.ApplyOSTweaks(loadBalancerConfig)
  598. }
  599. }
  600. }