ov_network.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626
  1. //go:build linux
  2. package overlay
  3. import (
  4. "context"
  5. "errors"
  6. "fmt"
  7. "net"
  8. "os"
  9. "path/filepath"
  10. "runtime"
  11. "strconv"
  12. "strings"
  13. "sync"
  14. "github.com/containerd/containerd/log"
  15. "github.com/docker/docker/libnetwork/driverapi"
  16. "github.com/docker/docker/libnetwork/drivers/overlay/overlayutils"
  17. "github.com/docker/docker/libnetwork/netlabel"
  18. "github.com/docker/docker/libnetwork/ns"
  19. "github.com/docker/docker/libnetwork/osl"
  20. "github.com/docker/docker/libnetwork/types"
  21. "github.com/hashicorp/go-multierror"
  22. "github.com/vishvananda/netlink"
  23. "github.com/vishvananda/netns"
  24. "golang.org/x/sys/unix"
  25. )
  26. var (
  27. networkOnce sync.Once
  28. networkMu sync.Mutex
  29. vniTbl = make(map[uint32]string)
  30. )
// networkTable indexes overlay networks by a string key (presumably the
// network ID — confirm against the driver's usage).
type networkTable map[string]*network
// subnet holds the per-subnet state of an overlay network: its address pool
// and gateway, the VNI assigned to it, and the names of the devices created
// for it in the network sandbox.
type subnet struct {
	// sboxInit is set once initSubnetSandbox has succeeded for this subnet
	// and is cleared again by leaveSandbox.
	sboxInit bool
	// vxlanName is the vxlan device name recorded by initSubnetSandbox.
	vxlanName string
	// brName is the bridge device name recorded by initSubnetSandbox.
	brName string
	// vni is the VXLAN network identifier taken from the VNI list passed to
	// CreateNetwork.
	vni uint32
	// initErr is the error joinSandbox reports when the subnet is already
	// marked as initialized.
	initErr error
	// subnetIP is the address pool of the subnet.
	subnetIP *net.IPNet
	// gwIP is the gateway address; it is assigned to the subnet's bridge.
	gwIP *net.IPNet
}
// network is the driver's in-memory state for one overlay network. The
// embedded mutex is taken by joinSandbox, leaveSandbox and sandbox.
type network struct {
	// id is the network ID passed to CreateNetwork.
	id string
	// sbox is the network sandbox created by initSandbox; destroySandbox
	// resets it to nil.
	sbox *osl.Namespace
	// endpoints holds the endpoints known for this network.
	endpoints endpointTable
	// driver is the owning overlay driver.
	driver *driver
	// joinCnt is incremented by joinSandbox (when asked to) and decremented
	// by leaveSandbox, which destroys the sandbox when it reaches zero.
	joinCnt int
	// sboxInit is set after the first initSandbox attempt, whether or not it
	// succeeded, and is cleared by leaveSandbox.
	sboxInit bool
	// initEpoch is incremented on every initSandbox call and is part of the
	// sandbox key.
	initEpoch int
	// initErr is the result of the last initSandbox attempt.
	initErr error
	// subnets lists the subnets of the network, one per IPv4 pool.
	subnets []*subnet
	// secure is set when the secure option is present in the network options.
	secure bool
	// mtu is parsed from the driver MTU option; it stays zero when the
	// option is absent.
	mtu int
	sync.Mutex
}
// init locks the goroutine running package initialization to its OS thread.
func init() {
	// Lock main() to the initial thread to exclude the goroutines executing
	// func setDefaultVLAN() from being scheduled onto that thread. Changes to
	// the network namespace of the initial thread alter /proc/self/ns/net,
	// which would break any code which (incorrectly) assumes that that file is
	// a handle to the network namespace for the thread it is currently
	// executing on.
	runtime.LockOSThread()
}
// NetworkAllocate is not supported by this driver; it always returns a
// not-implemented error and a nil option map.
func (d *driver) NetworkAllocate(id string, option map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) {
	return nil, types.NotImplementedErrorf("not implemented")
}
// NetworkFree is not supported by this driver; it always returns a
// not-implemented error.
func (d *driver) NetworkFree(id string) error {
	return types.NotImplementedErrorf("not implemented")
}
  70. func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo driverapi.NetworkInfo, ipV4Data, ipV6Data []driverapi.IPAMData) error {
  71. if id == "" {
  72. return fmt.Errorf("invalid network id")
  73. }
  74. if len(ipV4Data) == 0 || ipV4Data[0].Pool.String() == "0.0.0.0/0" {
  75. return types.InvalidParameterErrorf("ipv4 pool is empty")
  76. }
  77. // Since we perform lazy configuration make sure we try
  78. // configuring the driver when we enter CreateNetwork
  79. if err := d.configure(); err != nil {
  80. return err
  81. }
  82. n := &network{
  83. id: id,
  84. driver: d,
  85. endpoints: endpointTable{},
  86. subnets: []*subnet{},
  87. }
  88. vnis := make([]uint32, 0, len(ipV4Data))
  89. gval, ok := option[netlabel.GenericData]
  90. if !ok {
  91. return fmt.Errorf("option %s is missing", netlabel.GenericData)
  92. }
  93. optMap := gval.(map[string]string)
  94. vnisOpt, ok := optMap[netlabel.OverlayVxlanIDList]
  95. if !ok {
  96. return errors.New("no VNI provided")
  97. }
  98. log.G(context.TODO()).Debugf("overlay: Received vxlan IDs: %s", vnisOpt)
  99. var err error
  100. vnis, err = overlayutils.AppendVNIList(vnis, vnisOpt)
  101. if err != nil {
  102. return err
  103. }
  104. if _, ok := optMap[secureOption]; ok {
  105. n.secure = true
  106. }
  107. if val, ok := optMap[netlabel.DriverMTU]; ok {
  108. var err error
  109. if n.mtu, err = strconv.Atoi(val); err != nil {
  110. return fmt.Errorf("failed to parse %v: %v", val, err)
  111. }
  112. if n.mtu < 0 {
  113. return fmt.Errorf("invalid MTU value: %v", n.mtu)
  114. }
  115. }
  116. if len(vnis) == 0 {
  117. return errors.New("no VNI provided")
  118. } else if len(vnis) < len(ipV4Data) {
  119. return fmt.Errorf("insufficient vnis(%d) passed to overlay", len(vnis))
  120. }
  121. for i, ipd := range ipV4Data {
  122. s := &subnet{
  123. subnetIP: ipd.Pool,
  124. gwIP: ipd.Gateway,
  125. vni: vnis[i],
  126. }
  127. n.subnets = append(n.subnets, s)
  128. }
  129. d.Lock()
  130. defer d.Unlock()
  131. if d.networks[n.id] != nil {
  132. return fmt.Errorf("attempt to create overlay network %v that already exists", n.id)
  133. }
  134. // Make sure no rule is on the way from any stale secure network
  135. if !n.secure {
  136. for _, vni := range vnis {
  137. programMangle(vni, false)
  138. programInput(vni, false)
  139. }
  140. }
  141. if nInfo != nil {
  142. if err := nInfo.TableEventRegister(ovPeerTable, driverapi.EndpointObject); err != nil {
  143. // XXX Undo writeToStore? No method to so. Why?
  144. return err
  145. }
  146. }
  147. d.networks[id] = n
  148. return nil
  149. }
  150. func (d *driver) DeleteNetwork(nid string) error {
  151. if nid == "" {
  152. return fmt.Errorf("invalid network id")
  153. }
  154. // Make sure driver resources are initialized before proceeding
  155. if err := d.configure(); err != nil {
  156. return err
  157. }
  158. d.Lock()
  159. // Only perform a peer flush operation (if required) AFTER unlocking
  160. // the driver lock to avoid deadlocking w/ the peerDB.
  161. var doPeerFlush bool
  162. defer func() {
  163. d.Unlock()
  164. if doPeerFlush {
  165. d.peerFlush(nid)
  166. }
  167. }()
  168. // This is similar to d.network(), but we need to keep holding the lock
  169. // until we are done removing this network.
  170. n := d.networks[nid]
  171. if n == nil {
  172. return fmt.Errorf("could not find network with id %s", nid)
  173. }
  174. for _, ep := range n.endpoints {
  175. if ep.ifName != "" {
  176. if link, err := ns.NlHandle().LinkByName(ep.ifName); err == nil {
  177. if err := ns.NlHandle().LinkDel(link); err != nil {
  178. log.G(context.TODO()).WithError(err).Warnf("Failed to delete interface (%s)'s link on endpoint (%s) delete", ep.ifName, ep.id)
  179. }
  180. }
  181. }
  182. }
  183. doPeerFlush = true
  184. delete(d.networks, nid)
  185. if n.secure {
  186. for _, s := range n.subnets {
  187. if err := programMangle(s.vni, false); err != nil {
  188. log.G(context.TODO()).WithFields(log.Fields{
  189. "error": err,
  190. "network_id": n.id,
  191. "subnet": s.subnetIP,
  192. }).Warn("Failed to clean up iptables rules during overlay network deletion")
  193. }
  194. if err := programInput(s.vni, false); err != nil {
  195. log.G(context.TODO()).WithFields(log.Fields{
  196. "error": err,
  197. "network_id": n.id,
  198. "subnet": s.subnetIP,
  199. }).Warn("Failed to clean up iptables rules during overlay network deletion")
  200. }
  201. }
  202. }
  203. return nil
  204. }
// ProgramExternalConnectivity is a no-op for this driver and always returns
// nil.
func (d *driver) ProgramExternalConnectivity(nid, eid string, options map[string]interface{}) error {
	return nil
}
// RevokeExternalConnectivity is a no-op for this driver and always returns
// nil.
func (d *driver) RevokeExternalConnectivity(nid, eid string) error {
	return nil
}
  211. func (n *network) joinSandbox(s *subnet, incJoinCount bool) error {
  212. // If there is a race between two go routines here only one will win
  213. // the other will wait.
  214. networkOnce.Do(populateVNITbl)
  215. n.Lock()
  216. // If initialization was successful then tell the peerDB to initialize the
  217. // sandbox with all the peers previously received from networkdb. But only
  218. // do this after unlocking the network. Otherwise we could deadlock with
  219. // on the peerDB channel while peerDB is waiting for the network lock.
  220. var doInitPeerDB bool
  221. defer func() {
  222. n.Unlock()
  223. if doInitPeerDB {
  224. go n.driver.initSandboxPeerDB(n.id)
  225. }
  226. }()
  227. if !n.sboxInit {
  228. n.initErr = n.initSandbox()
  229. doInitPeerDB = n.initErr == nil
  230. // If there was an error, we cannot recover it
  231. n.sboxInit = true
  232. }
  233. if n.initErr != nil {
  234. return fmt.Errorf("network sandbox join failed: %v", n.initErr)
  235. }
  236. subnetErr := s.initErr
  237. if !s.sboxInit {
  238. subnetErr = n.initSubnetSandbox(s)
  239. // We can recover from these errors
  240. if subnetErr == nil {
  241. s.initErr = subnetErr
  242. s.sboxInit = true
  243. }
  244. }
  245. if subnetErr != nil {
  246. return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), subnetErr)
  247. }
  248. if incJoinCount {
  249. n.joinCnt++
  250. }
  251. return nil
  252. }
  253. func (n *network) leaveSandbox() {
  254. n.Lock()
  255. defer n.Unlock()
  256. n.joinCnt--
  257. if n.joinCnt != 0 {
  258. return
  259. }
  260. n.destroySandbox()
  261. n.sboxInit = false
  262. n.initErr = nil
  263. for _, s := range n.subnets {
  264. s.sboxInit = false
  265. s.initErr = nil
  266. }
  267. }
  268. // to be called while holding network lock
  269. func (n *network) destroySandbox() {
  270. if n.sbox != nil {
  271. for _, iface := range n.sbox.Interfaces() {
  272. if err := iface.Remove(); err != nil {
  273. log.G(context.TODO()).Debugf("Remove interface %s failed: %v", iface.SrcName(), err)
  274. }
  275. }
  276. for _, s := range n.subnets {
  277. if s.vxlanName != "" {
  278. err := deleteInterface(s.vxlanName)
  279. if err != nil {
  280. log.G(context.TODO()).Warnf("could not cleanup sandbox properly: %v", err)
  281. }
  282. }
  283. }
  284. n.sbox.Destroy()
  285. n.sbox = nil
  286. }
  287. }
  288. func populateVNITbl() {
  289. filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")),
  290. // NOTE(cpuguy83): The linter picked up on the fact that this walk function was not using this error argument
  291. // That seems wrong... however I'm not familiar with this code or if that error matters
  292. func(path string, _ os.DirEntry, _ error) error {
  293. _, fname := filepath.Split(path)
  294. if len(strings.Split(fname, "-")) <= 1 {
  295. return nil
  296. }
  297. n, err := netns.GetFromPath(path)
  298. if err != nil {
  299. log.G(context.TODO()).Errorf("Could not open namespace path %s during vni population: %v", path, err)
  300. return nil
  301. }
  302. defer n.Close()
  303. nlh, err := netlink.NewHandleAt(n, unix.NETLINK_ROUTE)
  304. if err != nil {
  305. log.G(context.TODO()).Errorf("Could not open netlink handle during vni population for ns %s: %v", path, err)
  306. return nil
  307. }
  308. defer nlh.Close()
  309. err = nlh.SetSocketTimeout(soTimeout)
  310. if err != nil {
  311. log.G(context.TODO()).Warnf("Failed to set the timeout on the netlink handle sockets for vni table population: %v", err)
  312. }
  313. links, err := nlh.LinkList()
  314. if err != nil {
  315. log.G(context.TODO()).Errorf("Failed to list interfaces during vni population for ns %s: %v", path, err)
  316. return nil
  317. }
  318. for _, l := range links {
  319. if l.Type() == "vxlan" {
  320. vniTbl[uint32(l.(*netlink.Vxlan).VxlanId)] = path
  321. }
  322. }
  323. return nil
  324. })
  325. }
  326. func (n *network) generateVxlanName(s *subnet) string {
  327. id := n.id
  328. if len(n.id) > 5 {
  329. id = n.id[:5]
  330. }
  331. return fmt.Sprintf("vx-%06x-%v", s.vni, id)
  332. }
  333. func (n *network) generateBridgeName(s *subnet) string {
  334. id := n.id
  335. if len(n.id) > 5 {
  336. id = n.id[:5]
  337. }
  338. return n.getBridgeNamePrefix(s) + "-" + id
  339. }
  340. func (n *network) getBridgeNamePrefix(s *subnet) string {
  341. return fmt.Sprintf("ov-%06x", s.vni)
  342. }
  343. func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error {
  344. // Try to find this subnet's vni is being used in some
  345. // other namespace by looking at vniTbl that we just
  346. // populated in the once init. If a hit is found then
  347. // it must a stale namespace from previous
  348. // life. Destroy it completely and reclaim resourced.
  349. networkMu.Lock()
  350. path, ok := vniTbl[s.vni]
  351. networkMu.Unlock()
  352. if ok {
  353. deleteVxlanByVNI(path, s.vni)
  354. if err := unix.Unmount(path, unix.MNT_FORCE); err != nil {
  355. log.G(context.TODO()).Errorf("unmount of %s failed: %v", path, err)
  356. }
  357. os.Remove(path)
  358. networkMu.Lock()
  359. delete(vniTbl, s.vni)
  360. networkMu.Unlock()
  361. }
  362. // create a bridge and vxlan device for this subnet and move it to the sandbox
  363. sbox := n.sbox
  364. if err := sbox.AddInterface(brName, "br", osl.WithIPv4Address(s.gwIP), osl.WithIsBridge(true)); err != nil {
  365. return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), err)
  366. }
  367. err := createVxlan(vxlanName, s.vni, n.maxMTU())
  368. if err != nil {
  369. return err
  370. }
  371. if err := sbox.AddInterface(vxlanName, "vxlan", osl.WithMaster(brName)); err != nil {
  372. // If adding vxlan device to the overlay namespace fails, remove the bridge interface we
  373. // already added to the namespace. This allows the caller to try the setup again.
  374. for _, iface := range sbox.Interfaces() {
  375. if iface.SrcName() == brName {
  376. if ierr := iface.Remove(); ierr != nil {
  377. log.G(context.TODO()).Errorf("removing bridge failed from ov ns %v failed, %v", n.sbox.Key(), ierr)
  378. }
  379. }
  380. }
  381. // Also, delete the vxlan interface. Since a global vni id is associated
  382. // with the vxlan interface, an orphaned vxlan interface will result in
  383. // failure of vxlan device creation if the vni is assigned to some other
  384. // network.
  385. if deleteErr := deleteInterface(vxlanName); deleteErr != nil {
  386. log.G(context.TODO()).Warnf("could not delete vxlan interface, %s, error %v, after config error, %v", vxlanName, deleteErr, err)
  387. }
  388. return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err)
  389. }
  390. if err := setDefaultVLAN(sbox); err != nil {
  391. // not a fatal error
  392. log.G(context.TODO()).WithError(err).Error("set bridge default vlan failed")
  393. }
  394. return nil
  395. }
// setDefaultVLAN writes "0\n" to the bridge/default_pvid sysfs attribute of
// the bridge interface in ns.
//
// The bridge is looked up among the interfaces of ns; when several interfaces
// report themselves as bridges the last one iterated is used, and when there
// is none the bridge name stays empty. The write happens inside
// ns.InvokeFunc, after unsharing the mount namespace and mounting a fresh
// sysfs on /sys.
//
// It returns the error from InvokeFunc if there is one, otherwise the first
// error hit inside the invoked function, otherwise nil.
func setDefaultVLAN(ns *osl.Namespace) error {
	var brName string
	for _, i := range ns.Interfaces() {
		if i.Bridge() {
			brName = i.DstName()
		}
	}

	// IFLA_BR_VLAN_DEFAULT_PVID was added in Linux v4.4 (see torvalds/linux@0f963b7), so we can't use netlink for
	// setting this until Docker drops support for CentOS/RHEL 7 (kernel 3.10, eol date: 2024-06-30).
	// innerErr carries a failure out of the closure, which cannot return one.
	var innerErr error
	err := ns.InvokeFunc(func() {
		// Contrary to what the sysfs(5) man page says, the entries of /sys/class/net
		// represent the networking devices visible in the network namespace of the
		// process which mounted the sysfs filesystem, irrespective of the network
		// namespace of the process accessing the directory. Remount sysfs in order to
		// see the network devices in sbox's network namespace, making sure the mount
		// doesn't propagate back.
		//
		// The Linux implementation of (osl.Sandbox).InvokeFunc() runs the function in a
		// dedicated goroutine. The effects of unshare(CLONE_NEWNS) on a thread cannot
		// be reverted so the thread needs to be terminated once the goroutine is
		// finished.
		runtime.LockOSThread()
		if err := unix.Unshare(unix.CLONE_NEWNS); err != nil {
			innerErr = os.NewSyscallError("unshare", err)
			return
		}
		// Make every mount a slave, recursively, so the sysfs mount below
		// does not propagate back.
		if err := unix.Mount("", "/", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
			innerErr = &os.PathError{Op: "mount", Path: "/", Err: err}
			return
		}
		if err := unix.Mount("sysfs", "/sys", "sysfs", 0, ""); err != nil {
			innerErr = &os.PathError{Op: "mount", Path: "/sys", Err: err}
			return
		}

		path := filepath.Join("/sys/class/net", brName, "bridge/default_pvid")
		data := []byte{'0', '\n'}

		if err := os.WriteFile(path, data, 0o644); err != nil {
			innerErr = fmt.Errorf("failed to enable default vlan on bridge %s: %w", brName, err)
			return
		}
	})
	if err != nil {
		return err
	}
	return innerErr
}
  443. // Must be called with the network lock
  444. func (n *network) initSubnetSandbox(s *subnet) error {
  445. brName := n.generateBridgeName(s)
  446. vxlanName := n.generateVxlanName(s)
  447. // Program iptables rules for mandatory encryption of the secure
  448. // network, or clean up leftover rules for a stale secure network which
  449. // was previously assigned the same VNI.
  450. if err := programMangle(s.vni, n.secure); err != nil {
  451. return err
  452. }
  453. if err := programInput(s.vni, n.secure); err != nil {
  454. if n.secure {
  455. return multierror.Append(err, programMangle(s.vni, false))
  456. }
  457. }
  458. if err := n.setupSubnetSandbox(s, brName, vxlanName); err != nil {
  459. return err
  460. }
  461. s.vxlanName = vxlanName
  462. s.brName = brName
  463. return nil
  464. }
  465. func (n *network) cleanupStaleSandboxes() {
  466. filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")),
  467. func(path string, _ os.DirEntry, _ error) error {
  468. _, fname := filepath.Split(path)
  469. pList := strings.Split(fname, "-")
  470. if len(pList) <= 1 {
  471. return nil
  472. }
  473. pattern := pList[1]
  474. if strings.Contains(n.id, pattern) {
  475. // Delete all vnis
  476. deleteVxlanByVNI(path, 0)
  477. unix.Unmount(path, unix.MNT_DETACH)
  478. os.Remove(path)
  479. // Now that we have destroyed this
  480. // sandbox, remove all references to
  481. // it in vniTbl so that we don't
  482. // inadvertently destroy the sandbox
  483. // created in this life.
  484. networkMu.Lock()
  485. for vni, tblPath := range vniTbl {
  486. if tblPath == path {
  487. delete(vniTbl, vni)
  488. }
  489. }
  490. networkMu.Unlock()
  491. }
  492. return nil
  493. })
  494. }
  495. func (n *network) initSandbox() error {
  496. n.initEpoch++
  497. // If there are any stale sandboxes related to this network
  498. // from previous daemon life clean it up here
  499. n.cleanupStaleSandboxes()
  500. key := osl.GenerateKey(fmt.Sprintf("%d-", n.initEpoch) + n.id)
  501. sbox, err := osl.NewSandbox(key, true, false)
  502. if err != nil {
  503. return fmt.Errorf("could not get network sandbox: %v", err)
  504. }
  505. // this is needed to let the peerAdd configure the sandbox
  506. n.sbox = sbox
  507. return nil
  508. }
  509. func (d *driver) network(nid string) *network {
  510. d.Lock()
  511. n := d.networks[nid]
  512. d.Unlock()
  513. return n
  514. }
  515. func (n *network) sandbox() *osl.Namespace {
  516. n.Lock()
  517. defer n.Unlock()
  518. return n.sbox
  519. }
  520. // getSubnetforIP returns the subnet to which the given IP belongs
  521. func (n *network) getSubnetforIP(ip *net.IPNet) *subnet {
  522. for _, s := range n.subnets {
  523. // first check if the mask lengths are the same
  524. i, _ := s.subnetIP.Mask.Size()
  525. j, _ := ip.Mask.Size()
  526. if i != j {
  527. continue
  528. }
  529. if s.subnetIP.Contains(ip.IP) {
  530. return s
  531. }
  532. }
  533. return nil
  534. }