namespace_linux.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693
  1. package osl
  2. import (
  3. "fmt"
  4. "io/ioutil"
  5. "net"
  6. "os"
  7. "os/exec"
  8. "path/filepath"
  9. "runtime"
  10. "strconv"
  11. "strings"
  12. "sync"
  13. "syscall"
  14. "time"
  15. "github.com/docker/docker/pkg/reexec"
  16. "github.com/docker/libnetwork/ns"
  17. "github.com/docker/libnetwork/osl/kernel"
  18. "github.com/docker/libnetwork/types"
  19. "github.com/sirupsen/logrus"
  20. "github.com/vishvananda/netlink"
  21. "github.com/vishvananda/netns"
  22. )
// defaultPrefix is the initial value of prefix, the directory under which
// basePath places the "netns" directory. SetBasePath can replace it at
// run time.
const defaultPrefix = "/var/run/docker"
  24. func init() {
  25. reexec.Register("set-ipv6", reexecSetIPv6)
  26. }
  27. var (
  28. once sync.Once
  29. garbagePathMap = make(map[string]bool)
  30. gpmLock sync.Mutex
  31. gpmWg sync.WaitGroup
  32. gpmCleanupPeriod = 60 * time.Second
  33. gpmChan = make(chan chan struct{})
  34. prefix = defaultPrefix
  35. loadBalancerConfig = map[string]*kernel.OSValue{
  36. // disables any special handling on port reuse of existing IPVS connection table entries
  37. // more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L25:1
  38. "net.ipv4.vs.conn_reuse_mode": {Value: "0", CheckFn: nil},
  39. // expires connection from the IPVS connection table when the backend is not available
  40. // more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L126:1
  41. "net.ipv4.vs.expire_nodest_conn": {Value: "1", CheckFn: nil},
  42. // expires persistent connections to destination servers with weights set to 0
  43. // more info: https://github.com/torvalds/linux/blob/master/Documentation/networking/ipvs-sysctl.txt#L144:1
  44. "net.ipv4.vs.expire_quiescent_template": {Value: "1", CheckFn: nil},
  45. }
  46. )
// The networkNamespace type is the linux implementation of the Sandbox
// interface. It represents a linux network namespace, and moves an interface
// into it when called on method AddInterface or sets the gateway etc.
type networkNamespace struct {
	// path is the namespace file path; it is what gets opened with
	// netns.GetFromPath and unmounted by Destroy.
	path string
	// iFaces holds the interfaces tracked for this sandbox (appended to by
	// Restore).
	iFaces []*nwIface
	// gw and gwv6 are the gateways recorded for the sandbox.
	gw   net.IP
	gwv6 net.IP
	// staticRoutes are the static routes recorded for the sandbox.
	staticRoutes []*types.StaticRoute
	// neighbors is not touched by the code visible in this file section;
	// presumably the neighbor entries programmed in the sandbox (confirm
	// against the neighbor handling code).
	neighbors []*neigh
	// nextIfIndex maps a destination-name prefix to the next numeric suffix
	// to use for it (see Restore).
	nextIfIndex map[string]int
	// isDefault is set to !osCreate by NewSandbox.
	isDefault bool
	// nlHandle is the netlink handle opened inside the namespace.
	nlHandle *netlink.Handle
	// loV6Enabled remembers the IPv6 state last requested for "lo" by
	// checkLoV6.
	loV6Enabled bool
	// Embedded mutex; the visible code takes it around accesses to path
	// (nsPath), iFaces, nextIfIndex, staticRoutes, gw and gwv6.
	sync.Mutex
}
  63. // SetBasePath sets the base url prefix for the ns path
  64. func SetBasePath(path string) {
  65. prefix = path
  66. }
  67. func init() {
  68. reexec.Register("netns-create", reexecCreateNamespace)
  69. }
  70. func basePath() string {
  71. return filepath.Join(prefix, "netns")
  72. }
  73. func createBasePath() {
  74. err := os.MkdirAll(basePath(), 0755)
  75. if err != nil {
  76. panic("Could not create net namespace path directory")
  77. }
  78. // Start the garbage collection go routine
  79. go removeUnusedPaths()
  80. }
  81. func removeUnusedPaths() {
  82. gpmLock.Lock()
  83. period := gpmCleanupPeriod
  84. gpmLock.Unlock()
  85. ticker := time.NewTicker(period)
  86. for {
  87. var (
  88. gc chan struct{}
  89. gcOk bool
  90. )
  91. select {
  92. case <-ticker.C:
  93. case gc, gcOk = <-gpmChan:
  94. }
  95. gpmLock.Lock()
  96. pathList := make([]string, 0, len(garbagePathMap))
  97. for path := range garbagePathMap {
  98. pathList = append(pathList, path)
  99. }
  100. garbagePathMap = make(map[string]bool)
  101. gpmWg.Add(1)
  102. gpmLock.Unlock()
  103. for _, path := range pathList {
  104. os.Remove(path)
  105. }
  106. gpmWg.Done()
  107. if gcOk {
  108. close(gc)
  109. }
  110. }
  111. }
  112. func addToGarbagePaths(path string) {
  113. gpmLock.Lock()
  114. garbagePathMap[path] = true
  115. gpmLock.Unlock()
  116. }
  117. func removeFromGarbagePaths(path string) {
  118. gpmLock.Lock()
  119. delete(garbagePathMap, path)
  120. gpmLock.Unlock()
  121. }
  122. // GC triggers garbage collection of namespace path right away
  123. // and waits for it.
  124. func GC() {
  125. gpmLock.Lock()
  126. if len(garbagePathMap) == 0 {
  127. // No need for GC if map is empty
  128. gpmLock.Unlock()
  129. return
  130. }
  131. gpmLock.Unlock()
  132. // if content exists in the garbage paths
  133. // we can trigger GC to run, providing a
  134. // channel to be notified on completion
  135. waitGC := make(chan struct{})
  136. gpmChan <- waitGC
  137. // wait for GC completion
  138. <-waitGC
  139. }
  140. // GenerateKey generates a sandbox key based on the passed
  141. // container id.
  142. func GenerateKey(containerID string) string {
  143. maxLen := 12
  144. // Read sandbox key from host for overlay
  145. if strings.HasPrefix(containerID, "-") {
  146. var (
  147. index int
  148. indexStr string
  149. tmpkey string
  150. )
  151. dir, err := ioutil.ReadDir(basePath())
  152. if err != nil {
  153. return ""
  154. }
  155. for _, v := range dir {
  156. id := v.Name()
  157. if strings.HasSuffix(id, containerID[:maxLen-1]) {
  158. indexStr = strings.TrimSuffix(id, containerID[:maxLen-1])
  159. tmpindex, err := strconv.Atoi(indexStr)
  160. if err != nil {
  161. return ""
  162. }
  163. if tmpindex > index {
  164. index = tmpindex
  165. tmpkey = id
  166. }
  167. }
  168. }
  169. containerID = tmpkey
  170. if containerID == "" {
  171. return ""
  172. }
  173. }
  174. if len(containerID) < maxLen {
  175. maxLen = len(containerID)
  176. }
  177. return basePath() + "/" + containerID[:maxLen]
  178. }
  179. // NewSandbox provides a new sandbox instance created in an os specific way
  180. // provided a key which uniquely identifies the sandbox
  181. func NewSandbox(key string, osCreate, isRestore bool) (Sandbox, error) {
  182. if !isRestore {
  183. err := createNetworkNamespace(key, osCreate)
  184. if err != nil {
  185. return nil, err
  186. }
  187. } else {
  188. once.Do(createBasePath)
  189. }
  190. n := &networkNamespace{path: key, isDefault: !osCreate, nextIfIndex: make(map[string]int)}
  191. sboxNs, err := netns.GetFromPath(n.path)
  192. if err != nil {
  193. return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
  194. }
  195. defer sboxNs.Close()
  196. n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
  197. if err != nil {
  198. return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
  199. }
  200. err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
  201. if err != nil {
  202. logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
  203. }
  204. // In live-restore mode, IPV6 entries are getting cleaned up due to below code
  205. // We should retain IPV6 configurations in live-restore mode when Docker Daemon
  206. // comes back. It should work as it is on other cases
  207. // As starting point, disable IPv6 on all interfaces
  208. if !isRestore && !n.isDefault {
  209. err = setIPv6(n.path, "all", false)
  210. if err != nil {
  211. logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
  212. }
  213. }
  214. if err = n.loopbackUp(); err != nil {
  215. n.nlHandle.Delete()
  216. return nil, err
  217. }
  218. return n, nil
  219. }
  220. func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter {
  221. return n
  222. }
  223. func (n *networkNamespace) NeighborOptions() NeighborOptionSetter {
  224. return n
  225. }
  226. func mountNetworkNamespace(basePath string, lnPath string) error {
  227. return syscall.Mount(basePath, lnPath, "bind", syscall.MS_BIND, "")
  228. }
  229. // GetSandboxForExternalKey returns sandbox object for the supplied path
  230. func GetSandboxForExternalKey(basePath string, key string) (Sandbox, error) {
  231. if err := createNamespaceFile(key); err != nil {
  232. return nil, err
  233. }
  234. if err := mountNetworkNamespace(basePath, key); err != nil {
  235. return nil, err
  236. }
  237. n := &networkNamespace{path: key, nextIfIndex: make(map[string]int)}
  238. sboxNs, err := netns.GetFromPath(n.path)
  239. if err != nil {
  240. return nil, fmt.Errorf("failed get network namespace %q: %v", n.path, err)
  241. }
  242. defer sboxNs.Close()
  243. n.nlHandle, err = netlink.NewHandleAt(sboxNs, syscall.NETLINK_ROUTE)
  244. if err != nil {
  245. return nil, fmt.Errorf("failed to create a netlink handle: %v", err)
  246. }
  247. err = n.nlHandle.SetSocketTimeout(ns.NetlinkSocketsTimeout)
  248. if err != nil {
  249. logrus.Warnf("Failed to set the timeout on the sandbox netlink handle sockets: %v", err)
  250. }
  251. // As starting point, disable IPv6 on all interfaces
  252. err = setIPv6(n.path, "all", false)
  253. if err != nil {
  254. logrus.Warnf("Failed to disable IPv6 on all interfaces on network namespace %q: %v", n.path, err)
  255. }
  256. if err = n.loopbackUp(); err != nil {
  257. n.nlHandle.Delete()
  258. return nil, err
  259. }
  260. return n, nil
  261. }
  262. func reexecCreateNamespace() {
  263. if len(os.Args) < 2 {
  264. logrus.Fatal("no namespace path provided")
  265. }
  266. if err := mountNetworkNamespace("/proc/self/ns/net", os.Args[1]); err != nil {
  267. logrus.Fatal(err)
  268. }
  269. }
  270. func createNetworkNamespace(path string, osCreate bool) error {
  271. if err := createNamespaceFile(path); err != nil {
  272. return err
  273. }
  274. cmd := &exec.Cmd{
  275. Path: reexec.Self(),
  276. Args: append([]string{"netns-create"}, path),
  277. Stdout: os.Stdout,
  278. Stderr: os.Stderr,
  279. }
  280. if osCreate {
  281. cmd.SysProcAttr = &syscall.SysProcAttr{}
  282. cmd.SysProcAttr.Cloneflags = syscall.CLONE_NEWNET
  283. }
  284. if err := cmd.Run(); err != nil {
  285. return fmt.Errorf("namespace creation reexec command failed: %v", err)
  286. }
  287. return nil
  288. }
  289. func unmountNamespaceFile(path string) {
  290. if _, err := os.Stat(path); err == nil {
  291. syscall.Unmount(path, syscall.MNT_DETACH)
  292. }
  293. }
  294. func createNamespaceFile(path string) (err error) {
  295. var f *os.File
  296. once.Do(createBasePath)
  297. // Remove it from garbage collection list if present
  298. removeFromGarbagePaths(path)
  299. // If the path is there unmount it first
  300. unmountNamespaceFile(path)
  301. // wait for garbage collection to complete if it is in progress
  302. // before trying to create the file.
  303. gpmWg.Wait()
  304. if f, err = os.Create(path); err == nil {
  305. f.Close()
  306. }
  307. return err
  308. }
  309. func (n *networkNamespace) loopbackUp() error {
  310. iface, err := n.nlHandle.LinkByName("lo")
  311. if err != nil {
  312. return err
  313. }
  314. return n.nlHandle.LinkSetUp(iface)
  315. }
  316. func (n *networkNamespace) GetLoopbackIfaceName() string {
  317. return "lo"
  318. }
  319. func (n *networkNamespace) AddAliasIP(ifName string, ip *net.IPNet) error {
  320. iface, err := n.nlHandle.LinkByName(ifName)
  321. if err != nil {
  322. return err
  323. }
  324. return n.nlHandle.AddrAdd(iface, &netlink.Addr{IPNet: ip})
  325. }
  326. func (n *networkNamespace) RemoveAliasIP(ifName string, ip *net.IPNet) error {
  327. iface, err := n.nlHandle.LinkByName(ifName)
  328. if err != nil {
  329. return err
  330. }
  331. return n.nlHandle.AddrDel(iface, &netlink.Addr{IPNet: ip})
  332. }
  333. func (n *networkNamespace) DisableARPForVIP(srcName string) (Err error) {
  334. dstName := ""
  335. for _, i := range n.Interfaces() {
  336. if i.SrcName() == srcName {
  337. dstName = i.DstName()
  338. break
  339. }
  340. }
  341. if dstName == "" {
  342. return fmt.Errorf("failed to find interface %s in sandbox", srcName)
  343. }
  344. err := n.InvokeFunc(func() {
  345. path := filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_ignore")
  346. if err := ioutil.WriteFile(path, []byte{'1', '\n'}, 0644); err != nil {
  347. Err = fmt.Errorf("Failed to set %s to 1: %v", path, err)
  348. return
  349. }
  350. path = filepath.Join("/proc/sys/net/ipv4/conf", dstName, "arp_announce")
  351. if err := ioutil.WriteFile(path, []byte{'2', '\n'}, 0644); err != nil {
  352. Err = fmt.Errorf("Failed to set %s to 2: %v", path, err)
  353. return
  354. }
  355. })
  356. if err != nil {
  357. return err
  358. }
  359. return
  360. }
  361. func (n *networkNamespace) InvokeFunc(f func()) error {
  362. return nsInvoke(n.nsPath(), func(nsFD int) error { return nil }, func(callerFD int) error {
  363. f()
  364. return nil
  365. })
  366. }
  367. // InitOSContext initializes OS context while configuring network resources
  368. func InitOSContext() func() {
  369. runtime.LockOSThread()
  370. if err := ns.SetNamespace(); err != nil {
  371. logrus.Error(err)
  372. }
  373. return runtime.UnlockOSThread
  374. }
  375. func nsInvoke(path string, prefunc func(nsFD int) error, postfunc func(callerFD int) error) error {
  376. defer InitOSContext()()
  377. newNs, err := netns.GetFromPath(path)
  378. if err != nil {
  379. return fmt.Errorf("failed get network namespace %q: %v", path, err)
  380. }
  381. defer newNs.Close()
  382. // Invoked before the namespace switch happens but after the namespace file
  383. // handle is obtained.
  384. if err := prefunc(int(newNs)); err != nil {
  385. return fmt.Errorf("failed in prefunc: %v", err)
  386. }
  387. if err = netns.Set(newNs); err != nil {
  388. return err
  389. }
  390. defer ns.SetNamespace()
  391. // Invoked after the namespace switch.
  392. return postfunc(ns.ParseHandlerInt())
  393. }
  394. func (n *networkNamespace) nsPath() string {
  395. n.Lock()
  396. defer n.Unlock()
  397. return n.path
  398. }
  399. func (n *networkNamespace) Info() Info {
  400. return n
  401. }
  402. func (n *networkNamespace) Key() string {
  403. return n.path
  404. }
  405. func (n *networkNamespace) Destroy() error {
  406. if n.nlHandle != nil {
  407. n.nlHandle.Delete()
  408. }
  409. // Assuming no running process is executing in this network namespace,
  410. // unmounting is sufficient to destroy it.
  411. if err := syscall.Unmount(n.path, syscall.MNT_DETACH); err != nil {
  412. return err
  413. }
  414. // Stash it into the garbage collection list
  415. addToGarbagePaths(n.path)
  416. return nil
  417. }
  418. // Restore restore the network namespace
  419. func (n *networkNamespace) Restore(ifsopt map[string][]IfaceOption, routes []*types.StaticRoute, gw net.IP, gw6 net.IP) error {
  420. // restore interfaces
  421. for name, opts := range ifsopt {
  422. if !strings.Contains(name, "+") {
  423. return fmt.Errorf("wrong iface name in restore osl sandbox interface: %s", name)
  424. }
  425. seps := strings.Split(name, "+")
  426. srcName := seps[0]
  427. dstPrefix := seps[1]
  428. i := &nwIface{srcName: srcName, dstName: dstPrefix, ns: n}
  429. i.processInterfaceOptions(opts...)
  430. if i.master != "" {
  431. i.dstMaster = n.findDst(i.master, true)
  432. if i.dstMaster == "" {
  433. return fmt.Errorf("could not find an appropriate master %q for %q",
  434. i.master, i.srcName)
  435. }
  436. }
  437. if n.isDefault {
  438. i.dstName = i.srcName
  439. } else {
  440. links, err := n.nlHandle.LinkList()
  441. if err != nil {
  442. return fmt.Errorf("failed to retrieve list of links in network namespace %q during restore", n.path)
  443. }
  444. // due to the docker network connect/disconnect, so the dstName should
  445. // restore from the namespace
  446. for _, link := range links {
  447. addrs, err := n.nlHandle.AddrList(link, netlink.FAMILY_V4)
  448. if err != nil {
  449. return err
  450. }
  451. ifaceName := link.Attrs().Name
  452. if strings.HasPrefix(ifaceName, "vxlan") {
  453. if i.dstName == "vxlan" {
  454. i.dstName = ifaceName
  455. break
  456. }
  457. }
  458. // find the interface name by ip
  459. if i.address != nil {
  460. for _, addr := range addrs {
  461. if addr.IPNet.String() == i.address.String() {
  462. i.dstName = ifaceName
  463. break
  464. }
  465. continue
  466. }
  467. if i.dstName == ifaceName {
  468. break
  469. }
  470. }
  471. // This is to find the interface name of the pair in overlay sandbox
  472. if strings.HasPrefix(ifaceName, "veth") {
  473. if i.master != "" && i.dstName == "veth" {
  474. i.dstName = ifaceName
  475. }
  476. }
  477. }
  478. var index int
  479. indexStr := strings.TrimPrefix(i.dstName, dstPrefix)
  480. if indexStr != "" {
  481. index, err = strconv.Atoi(indexStr)
  482. if err != nil {
  483. return err
  484. }
  485. }
  486. index++
  487. n.Lock()
  488. if index > n.nextIfIndex[dstPrefix] {
  489. n.nextIfIndex[dstPrefix] = index
  490. }
  491. n.iFaces = append(n.iFaces, i)
  492. n.Unlock()
  493. }
  494. }
  495. // restore routes
  496. for _, r := range routes {
  497. n.Lock()
  498. n.staticRoutes = append(n.staticRoutes, r)
  499. n.Unlock()
  500. }
  501. // restore gateway
  502. if len(gw) > 0 {
  503. n.Lock()
  504. n.gw = gw
  505. n.Unlock()
  506. }
  507. if len(gw6) > 0 {
  508. n.Lock()
  509. n.gwv6 = gw6
  510. n.Unlock()
  511. }
  512. return nil
  513. }
  514. // Checks whether IPv6 needs to be enabled/disabled on the loopback interface
  515. func (n *networkNamespace) checkLoV6() {
  516. var (
  517. enable = false
  518. action = "disable"
  519. )
  520. n.Lock()
  521. for _, iface := range n.iFaces {
  522. if iface.AddressIPv6() != nil {
  523. enable = true
  524. action = "enable"
  525. break
  526. }
  527. }
  528. n.Unlock()
  529. if n.loV6Enabled == enable {
  530. return
  531. }
  532. if err := setIPv6(n.path, "lo", enable); err != nil {
  533. logrus.Warnf("Failed to %s IPv6 on loopback interface on network namespace %q: %v", action, n.path, err)
  534. }
  535. n.loV6Enabled = enable
  536. }
  537. func reexecSetIPv6() {
  538. runtime.LockOSThread()
  539. defer runtime.UnlockOSThread()
  540. if len(os.Args) < 3 {
  541. logrus.Errorf("invalid number of arguments for %s", os.Args[0])
  542. os.Exit(1)
  543. }
  544. ns, err := netns.GetFromPath(os.Args[1])
  545. if err != nil {
  546. logrus.Errorf("failed get network namespace %q: %v", os.Args[1], err)
  547. os.Exit(2)
  548. }
  549. defer ns.Close()
  550. if err = netns.Set(ns); err != nil {
  551. logrus.Errorf("setting into container netns %q failed: %v", os.Args[1], err)
  552. os.Exit(3)
  553. }
  554. var (
  555. action = "disable"
  556. value = byte('1')
  557. path = fmt.Sprintf("/proc/sys/net/ipv6/conf/%s/disable_ipv6", os.Args[2])
  558. )
  559. if os.Args[3] == "true" {
  560. action = "enable"
  561. value = byte('0')
  562. }
  563. if _, err := os.Stat(path); err != nil {
  564. if os.IsNotExist(err) {
  565. logrus.Warnf("file does not exist: %s : %v Has IPv6 been disabled in this node's kernel?", path, err)
  566. os.Exit(0)
  567. }
  568. logrus.Errorf("failed to stat %s : %v", path, err)
  569. os.Exit(5)
  570. }
  571. if err = ioutil.WriteFile(path, []byte{value, '\n'}, 0644); err != nil {
  572. logrus.Errorf("failed to %s IPv6 forwarding for container's interface %s: %v", action, os.Args[2], err)
  573. os.Exit(4)
  574. }
  575. os.Exit(0)
  576. }
  577. func setIPv6(path, iface string, enable bool) error {
  578. cmd := &exec.Cmd{
  579. Path: reexec.Self(),
  580. Args: append([]string{"set-ipv6"}, path, iface, strconv.FormatBool(enable)),
  581. Stdout: os.Stdout,
  582. Stderr: os.Stderr,
  583. }
  584. if err := cmd.Run(); err != nil {
  585. return fmt.Errorf("reexec to set IPv6 failed: %v", err)
  586. }
  587. return nil
  588. }
  589. // ApplyOSTweaks applies linux configs on the sandbox
  590. func (n *networkNamespace) ApplyOSTweaks(types []SandboxType) {
  591. for _, t := range types {
  592. switch t {
  593. case SandboxTypeLoadBalancer:
  594. kernel.ApplyOSTweaks(loadBalancerConfig)
  595. }
  596. }
  597. }