ov_network.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629
  1. //go:build linux
  2. package overlay
  3. import (
  4. "context"
  5. "errors"
  6. "fmt"
  7. "net"
  8. "os"
  9. "path/filepath"
  10. "runtime"
  11. "strconv"
  12. "strings"
  13. "sync"
  14. "github.com/containerd/log"
  15. "github.com/docker/docker/libnetwork/driverapi"
  16. "github.com/docker/docker/libnetwork/drivers/overlay/overlayutils"
  17. "github.com/docker/docker/libnetwork/netlabel"
  18. "github.com/docker/docker/libnetwork/ns"
  19. "github.com/docker/docker/libnetwork/osl"
  20. "github.com/docker/docker/libnetwork/types"
  21. "github.com/hashicorp/go-multierror"
  22. "github.com/vishvananda/netlink"
  23. "github.com/vishvananda/netns"
  24. "golang.org/x/sys/unix"
  25. )
  // Package-level state shared by all overlay networks handled by this file.
  var (
  	// networkOnce makes sure populateVNITbl runs a single time; joinSandbox
  	// triggers it.
  	networkOnce sync.Once
  	// networkMu is held around the vniTbl accesses made by
  	// setupSubnetSandbox and cleanupStaleSandboxes.
  	networkMu sync.Mutex
  	// vniTbl maps a VXLAN ID to the path of the network namespace in which
  	// populateVNITbl found a vxlan link carrying that ID.
  	vniTbl = map[uint32]string{}
  )
  31. type networkTable map[string]*network
  32. type subnet struct {
  33. sboxInit bool
  34. vxlanName string
  35. brName string
  36. vni uint32
  37. initErr error
  38. subnetIP *net.IPNet
  39. gwIP *net.IPNet
  40. }
  41. type network struct {
  42. id string
  43. sbox *osl.Namespace
  44. endpoints endpointTable
  45. driver *driver
  46. joinCnt int
  47. sboxInit bool
  48. initEpoch int
  49. initErr error
  50. subnets []*subnet
  51. secure bool
  52. mtu int
  53. sync.Mutex
  54. }
  // init pins main() to the initial OS thread.
  func init() {
  	// With main() locked to the initial thread, the goroutines running
  	// setDefaultVLAN() can never be scheduled onto it. That matters because
  	// changing the network namespace of the initial thread changes what
  	// /proc/self/ns/net points at, which would break any code that
  	// (incorrectly) treats that file as a handle to the network namespace
  	// of the thread it is currently running on.
  	runtime.LockOSThread()
  }
  64. func (d *driver) NetworkAllocate(id string, option map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) {
  65. return nil, types.NotImplementedErrorf("not implemented")
  66. }
  67. func (d *driver) NetworkFree(id string) error {
  68. return types.NotImplementedErrorf("not implemented")
  69. }
  70. func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo driverapi.NetworkInfo, ipV4Data, ipV6Data []driverapi.IPAMData) error {
  71. if id == "" {
  72. return fmt.Errorf("invalid network id")
  73. }
  74. if len(ipV4Data) == 0 || ipV4Data[0].Pool.String() == "0.0.0.0/0" {
  75. return types.InvalidParameterErrorf("ipv4 pool is empty")
  76. }
  77. // Since we perform lazy configuration make sure we try
  78. // configuring the driver when we enter CreateNetwork
  79. if err := d.configure(); err != nil {
  80. return err
  81. }
  82. n := &network{
  83. id: id,
  84. driver: d,
  85. endpoints: endpointTable{},
  86. subnets: []*subnet{},
  87. }
  88. vnis := make([]uint32, 0, len(ipV4Data))
  89. gval, ok := option[netlabel.GenericData]
  90. if !ok {
  91. return fmt.Errorf("option %s is missing", netlabel.GenericData)
  92. }
  93. optMap := gval.(map[string]string)
  94. vnisOpt, ok := optMap[netlabel.OverlayVxlanIDList]
  95. if !ok {
  96. return errors.New("no VNI provided")
  97. }
  98. log.G(context.TODO()).Debugf("overlay: Received vxlan IDs: %s", vnisOpt)
  99. var err error
  100. vnis, err = overlayutils.AppendVNIList(vnis, vnisOpt)
  101. if err != nil {
  102. return err
  103. }
  104. if _, ok := optMap[secureOption]; ok {
  105. n.secure = true
  106. }
  107. if val, ok := optMap[netlabel.DriverMTU]; ok {
  108. var err error
  109. if n.mtu, err = strconv.Atoi(val); err != nil {
  110. return fmt.Errorf("failed to parse %v: %v", val, err)
  111. }
  112. if n.mtu < 0 {
  113. return fmt.Errorf("invalid MTU value: %v", n.mtu)
  114. }
  115. }
  116. if len(vnis) == 0 {
  117. return errors.New("no VNI provided")
  118. } else if len(vnis) < len(ipV4Data) {
  119. return fmt.Errorf("insufficient vnis(%d) passed to overlay", len(vnis))
  120. }
  121. for i, ipd := range ipV4Data {
  122. s := &subnet{
  123. subnetIP: ipd.Pool,
  124. gwIP: ipd.Gateway,
  125. vni: vnis[i],
  126. }
  127. n.subnets = append(n.subnets, s)
  128. }
  129. d.Lock()
  130. defer d.Unlock()
  131. if d.networks[n.id] != nil {
  132. return fmt.Errorf("attempt to create overlay network %v that already exists", n.id)
  133. }
  134. // Make sure no rule is on the way from any stale secure network
  135. if !n.secure {
  136. for _, vni := range vnis {
  137. d.programMangle(vni, false)
  138. d.programInput(vni, false)
  139. }
  140. }
  141. if nInfo != nil {
  142. if err := nInfo.TableEventRegister(ovPeerTable, driverapi.EndpointObject); err != nil {
  143. // XXX Undo writeToStore? No method to so. Why?
  144. return err
  145. }
  146. }
  147. d.networks[id] = n
  148. return nil
  149. }
  150. func (d *driver) DeleteNetwork(nid string) error {
  151. if nid == "" {
  152. return fmt.Errorf("invalid network id")
  153. }
  154. // Make sure driver resources are initialized before proceeding
  155. if err := d.configure(); err != nil {
  156. return err
  157. }
  158. d.Lock()
  159. // Only perform a peer flush operation (if required) AFTER unlocking
  160. // the driver lock to avoid deadlocking w/ the peerDB.
  161. var doPeerFlush bool
  162. defer func() {
  163. d.Unlock()
  164. if doPeerFlush {
  165. d.peerFlush(nid)
  166. }
  167. }()
  168. // This is similar to d.network(), but we need to keep holding the lock
  169. // until we are done removing this network.
  170. n := d.networks[nid]
  171. if n == nil {
  172. return fmt.Errorf("could not find network with id %s", nid)
  173. }
  174. for _, ep := range n.endpoints {
  175. if ep.ifName != "" {
  176. if link, err := ns.NlHandle().LinkByName(ep.ifName); err == nil {
  177. if err := ns.NlHandle().LinkDel(link); err != nil {
  178. log.G(context.TODO()).WithError(err).Warnf("Failed to delete interface (%s)'s link on endpoint (%s) delete", ep.ifName, ep.id)
  179. }
  180. }
  181. }
  182. }
  183. doPeerFlush = true
  184. delete(d.networks, nid)
  185. if n.secure {
  186. for _, s := range n.subnets {
  187. if err := d.programMangle(s.vni, false); err != nil {
  188. log.G(context.TODO()).WithFields(log.Fields{
  189. "error": err,
  190. "network_id": n.id,
  191. "subnet": s.subnetIP,
  192. }).Warn("Failed to clean up iptables rules during overlay network deletion")
  193. }
  194. if err := d.programInput(s.vni, false); err != nil {
  195. log.G(context.TODO()).WithFields(log.Fields{
  196. "error": err,
  197. "network_id": n.id,
  198. "subnet": s.subnetIP,
  199. }).Warn("Failed to clean up iptables rules during overlay network deletion")
  200. }
  201. }
  202. }
  203. return nil
  204. }
  205. func (d *driver) ProgramExternalConnectivity(nid, eid string, options map[string]interface{}) error {
  206. return nil
  207. }
  208. func (d *driver) RevokeExternalConnectivity(nid, eid string) error {
  209. return nil
  210. }
  211. func (n *network) joinSandbox(s *subnet, incJoinCount bool) error {
  212. // If there is a race between two go routines here only one will win
  213. // the other will wait.
  214. networkOnce.Do(populateVNITbl)
  215. n.Lock()
  216. // If initialization was successful then tell the peerDB to initialize the
  217. // sandbox with all the peers previously received from networkdb. But only
  218. // do this after unlocking the network. Otherwise we could deadlock with
  219. // on the peerDB channel while peerDB is waiting for the network lock.
  220. var doInitPeerDB bool
  221. defer func() {
  222. n.Unlock()
  223. if doInitPeerDB {
  224. go n.driver.initSandboxPeerDB(n.id)
  225. }
  226. }()
  227. if !n.sboxInit {
  228. n.initErr = n.initSandbox()
  229. doInitPeerDB = n.initErr == nil
  230. // If there was an error, we cannot recover it
  231. n.sboxInit = true
  232. }
  233. if n.initErr != nil {
  234. return fmt.Errorf("network sandbox join failed: %v", n.initErr)
  235. }
  236. subnetErr := s.initErr
  237. if !s.sboxInit {
  238. subnetErr = n.initSubnetSandbox(s)
  239. // We can recover from these errors
  240. if subnetErr == nil {
  241. s.initErr = subnetErr
  242. s.sboxInit = true
  243. }
  244. }
  245. if subnetErr != nil {
  246. return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), subnetErr)
  247. }
  248. if incJoinCount {
  249. n.joinCnt++
  250. }
  251. return nil
  252. }
  253. func (n *network) leaveSandbox() {
  254. n.Lock()
  255. defer n.Unlock()
  256. n.joinCnt--
  257. if n.joinCnt != 0 {
  258. return
  259. }
  260. n.destroySandbox()
  261. n.sboxInit = false
  262. n.initErr = nil
  263. for _, s := range n.subnets {
  264. s.sboxInit = false
  265. s.initErr = nil
  266. }
  267. }
  268. // to be called while holding network lock
  269. func (n *network) destroySandbox() {
  270. if n.sbox != nil {
  271. for _, iface := range n.sbox.Interfaces() {
  272. if err := iface.Remove(); err != nil {
  273. log.G(context.TODO()).Debugf("Remove interface %s failed: %v", iface.SrcName(), err)
  274. }
  275. }
  276. for _, s := range n.subnets {
  277. if s.vxlanName != "" {
  278. err := deleteInterface(s.vxlanName)
  279. if err != nil {
  280. log.G(context.TODO()).Warnf("could not cleanup sandbox properly: %v", err)
  281. }
  282. }
  283. }
  284. n.sbox.Destroy()
  285. n.sbox = nil
  286. }
  287. }
  288. func populateVNITbl() {
  289. filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")),
  290. // NOTE(cpuguy83): The linter picked up on the fact that this walk function was not using this error argument
  291. // That seems wrong... however I'm not familiar with this code or if that error matters
  292. func(path string, _ os.DirEntry, _ error) error {
  293. _, fname := filepath.Split(path)
  294. if len(strings.Split(fname, "-")) <= 1 {
  295. return nil
  296. }
  297. n, err := netns.GetFromPath(path)
  298. if err != nil {
  299. log.G(context.TODO()).Errorf("Could not open namespace path %s during vni population: %v", path, err)
  300. return nil
  301. }
  302. defer n.Close()
  303. nlh, err := netlink.NewHandleAt(n, unix.NETLINK_ROUTE)
  304. if err != nil {
  305. log.G(context.TODO()).Errorf("Could not open netlink handle during vni population for ns %s: %v", path, err)
  306. return nil
  307. }
  308. defer nlh.Close()
  309. err = nlh.SetSocketTimeout(soTimeout)
  310. if err != nil {
  311. log.G(context.TODO()).Warnf("Failed to set the timeout on the netlink handle sockets for vni table population: %v", err)
  312. }
  313. links, err := nlh.LinkList()
  314. if err != nil {
  315. log.G(context.TODO()).Errorf("Failed to list interfaces during vni population for ns %s: %v", path, err)
  316. return nil
  317. }
  318. for _, l := range links {
  319. if l.Type() == "vxlan" {
  320. vniTbl[uint32(l.(*netlink.Vxlan).VxlanId)] = path
  321. }
  322. }
  323. return nil
  324. })
  325. }
  326. func (n *network) generateVxlanName(s *subnet) string {
  327. id := n.id
  328. if len(n.id) > 5 {
  329. id = n.id[:5]
  330. }
  331. return fmt.Sprintf("vx-%06x-%v", s.vni, id)
  332. }
  333. func (n *network) generateBridgeName(s *subnet) string {
  334. id := n.id
  335. if len(n.id) > 5 {
  336. id = n.id[:5]
  337. }
  338. return n.getBridgeNamePrefix(s) + "-" + id
  339. }
  340. func (n *network) getBridgeNamePrefix(s *subnet) string {
  341. return fmt.Sprintf("ov-%06x", s.vni)
  342. }
  343. func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error {
  344. // Try to find this subnet's vni is being used in some
  345. // other namespace by looking at vniTbl that we just
  346. // populated in the once init. If a hit is found then
  347. // it must a stale namespace from previous
  348. // life. Destroy it completely and reclaim resourced.
  349. networkMu.Lock()
  350. path, ok := vniTbl[s.vni]
  351. networkMu.Unlock()
  352. if ok {
  353. deleteVxlanByVNI(path, s.vni)
  354. if err := unix.Unmount(path, unix.MNT_FORCE); err != nil {
  355. log.G(context.TODO()).Errorf("unmount of %s failed: %v", path, err)
  356. }
  357. os.Remove(path)
  358. networkMu.Lock()
  359. delete(vniTbl, s.vni)
  360. networkMu.Unlock()
  361. }
  362. // create a bridge and vxlan device for this subnet and move it to the sandbox
  363. sbox := n.sbox
  364. if err := sbox.AddInterface(brName, "br", osl.WithIPv4Address(s.gwIP), osl.WithIsBridge(true)); err != nil {
  365. return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), err)
  366. }
  367. v6transport, err := n.driver.isIPv6Transport()
  368. if err != nil {
  369. log.G(context.TODO()).WithError(err).Errorf("Assuming IPv4 transport; overlay network %s will not pass traffic if the Swarm data plane is IPv6.", n.id)
  370. }
  371. if err := createVxlan(vxlanName, s.vni, n.maxMTU(), v6transport); err != nil {
  372. return err
  373. }
  374. if err := sbox.AddInterface(vxlanName, "vxlan", osl.WithMaster(brName)); err != nil {
  375. // If adding vxlan device to the overlay namespace fails, remove the bridge interface we
  376. // already added to the namespace. This allows the caller to try the setup again.
  377. for _, iface := range sbox.Interfaces() {
  378. if iface.SrcName() == brName {
  379. if ierr := iface.Remove(); ierr != nil {
  380. log.G(context.TODO()).Errorf("removing bridge failed from ov ns %v failed, %v", n.sbox.Key(), ierr)
  381. }
  382. }
  383. }
  384. // Also, delete the vxlan interface. Since a global vni id is associated
  385. // with the vxlan interface, an orphaned vxlan interface will result in
  386. // failure of vxlan device creation if the vni is assigned to some other
  387. // network.
  388. if deleteErr := deleteInterface(vxlanName); deleteErr != nil {
  389. log.G(context.TODO()).Warnf("could not delete vxlan interface, %s, error %v, after config error, %v", vxlanName, deleteErr, err)
  390. }
  391. return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err)
  392. }
  393. if err := setDefaultVLAN(sbox); err != nil {
  394. // not a fatal error
  395. log.G(context.TODO()).WithError(err).Error("set bridge default vlan failed")
  396. }
  397. return nil
  398. }
  399. func setDefaultVLAN(ns *osl.Namespace) error {
  400. var brName string
  401. for _, i := range ns.Interfaces() {
  402. if i.Bridge() {
  403. brName = i.DstName()
  404. }
  405. }
  406. // IFLA_BR_VLAN_DEFAULT_PVID was added in Linux v4.4 (see torvalds/linux@0f963b7), so we can't use netlink for
  407. // setting this until Docker drops support for CentOS/RHEL 7 (kernel 3.10, eol date: 2024-06-30).
  408. var innerErr error
  409. err := ns.InvokeFunc(func() {
  410. // Contrary to what the sysfs(5) man page says, the entries of /sys/class/net
  411. // represent the networking devices visible in the network namespace of the
  412. // process which mounted the sysfs filesystem, irrespective of the network
  413. // namespace of the process accessing the directory. Remount sysfs in order to
  414. // see the network devices in sbox's network namespace, making sure the mount
  415. // doesn't propagate back.
  416. //
  417. // The Linux implementation of (osl.Sandbox).InvokeFunc() runs the function in a
  418. // dedicated goroutine. The effects of unshare(CLONE_NEWNS) on a thread cannot
  419. // be reverted so the thread needs to be terminated once the goroutine is
  420. // finished.
  421. runtime.LockOSThread()
  422. if err := unix.Unshare(unix.CLONE_NEWNS); err != nil {
  423. innerErr = os.NewSyscallError("unshare", err)
  424. return
  425. }
  426. if err := unix.Mount("", "/", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
  427. innerErr = &os.PathError{Op: "mount", Path: "/", Err: err}
  428. return
  429. }
  430. if err := unix.Mount("sysfs", "/sys", "sysfs", 0, ""); err != nil {
  431. innerErr = &os.PathError{Op: "mount", Path: "/sys", Err: err}
  432. return
  433. }
  434. path := filepath.Join("/sys/class/net", brName, "bridge/default_pvid")
  435. data := []byte{'0', '\n'}
  436. if err := os.WriteFile(path, data, 0o644); err != nil {
  437. innerErr = fmt.Errorf("failed to enable default vlan on bridge %s: %w", brName, err)
  438. return
  439. }
  440. })
  441. if err != nil {
  442. return err
  443. }
  444. return innerErr
  445. }
  446. // Must be called with the network lock
  447. func (n *network) initSubnetSandbox(s *subnet) error {
  448. brName := n.generateBridgeName(s)
  449. vxlanName := n.generateVxlanName(s)
  450. // Program iptables rules for mandatory encryption of the secure
  451. // network, or clean up leftover rules for a stale secure network which
  452. // was previously assigned the same VNI.
  453. if err := n.driver.programMangle(s.vni, n.secure); err != nil {
  454. return err
  455. }
  456. if err := n.driver.programInput(s.vni, n.secure); err != nil {
  457. if n.secure {
  458. return multierror.Append(err, n.driver.programMangle(s.vni, false))
  459. }
  460. }
  461. if err := n.setupSubnetSandbox(s, brName, vxlanName); err != nil {
  462. return err
  463. }
  464. s.vxlanName = vxlanName
  465. s.brName = brName
  466. return nil
  467. }
  468. func (n *network) cleanupStaleSandboxes() {
  469. filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")),
  470. func(path string, _ os.DirEntry, _ error) error {
  471. _, fname := filepath.Split(path)
  472. pList := strings.Split(fname, "-")
  473. if len(pList) <= 1 {
  474. return nil
  475. }
  476. pattern := pList[1]
  477. if strings.Contains(n.id, pattern) {
  478. // Delete all vnis
  479. deleteVxlanByVNI(path, 0)
  480. unix.Unmount(path, unix.MNT_DETACH)
  481. os.Remove(path)
  482. // Now that we have destroyed this
  483. // sandbox, remove all references to
  484. // it in vniTbl so that we don't
  485. // inadvertently destroy the sandbox
  486. // created in this life.
  487. networkMu.Lock()
  488. for vni, tblPath := range vniTbl {
  489. if tblPath == path {
  490. delete(vniTbl, vni)
  491. }
  492. }
  493. networkMu.Unlock()
  494. }
  495. return nil
  496. })
  497. }
  498. func (n *network) initSandbox() error {
  499. n.initEpoch++
  500. // If there are any stale sandboxes related to this network
  501. // from previous daemon life clean it up here
  502. n.cleanupStaleSandboxes()
  503. key := osl.GenerateKey(fmt.Sprintf("%d-", n.initEpoch) + n.id)
  504. sbox, err := osl.NewSandbox(key, true, false)
  505. if err != nil {
  506. return fmt.Errorf("could not get network sandbox: %v", err)
  507. }
  508. // this is needed to let the peerAdd configure the sandbox
  509. n.sbox = sbox
  510. return nil
  511. }
  512. func (d *driver) network(nid string) *network {
  513. d.Lock()
  514. n := d.networks[nid]
  515. d.Unlock()
  516. return n
  517. }
  518. func (n *network) sandbox() *osl.Namespace {
  519. n.Lock()
  520. defer n.Unlock()
  521. return n.sbox
  522. }
  523. // getSubnetforIP returns the subnet to which the given IP belongs
  524. func (n *network) getSubnetforIP(ip *net.IPNet) *subnet {
  525. for _, s := range n.subnets {
  526. // first check if the mask lengths are the same
  527. i, _ := s.subnetIP.Mask.Size()
  528. j, _ := ip.Mask.Size()
  529. if i != j {
  530. continue
  531. }
  532. if s.subnetIP.Contains(ip.IP) {
  533. return s
  534. }
  535. }
  536. return nil
  537. }