ov_network.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664
  1. package overlay
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "net"
  6. "os"
  7. "path/filepath"
  8. "strings"
  9. "sync"
  10. "syscall"
  11. "github.com/Sirupsen/logrus"
  12. "github.com/docker/libnetwork/datastore"
  13. "github.com/docker/libnetwork/driverapi"
  14. "github.com/docker/libnetwork/netutils"
  15. "github.com/docker/libnetwork/osl"
  16. "github.com/docker/libnetwork/resolvconf"
  17. "github.com/docker/libnetwork/types"
  18. "github.com/vishvananda/netlink"
  19. "github.com/vishvananda/netlink/nl"
  20. )
  21. var (
  22. hostMode bool
  23. hostModeOnce sync.Once
  24. )
  25. type networkTable map[string]*network
  26. type subnet struct {
  27. once *sync.Once
  28. vxlanName string
  29. brName string
  30. vni uint32
  31. initErr error
  32. subnetIP *net.IPNet
  33. gwIP *net.IPNet
  34. }
  35. type subnetJSON struct {
  36. SubnetIP string
  37. GwIP string
  38. Vni uint32
  39. }
  40. type network struct {
  41. id string
  42. dbIndex uint64
  43. dbExists bool
  44. sbox osl.Sandbox
  45. endpoints endpointTable
  46. driver *driver
  47. joinCnt int
  48. once *sync.Once
  49. initEpoch int
  50. initErr error
  51. subnets []*subnet
  52. sync.Mutex
  53. }
  54. func (d *driver) CreateNetwork(id string, option map[string]interface{}, ipV4Data, ipV6Data []driverapi.IPAMData) error {
  55. if id == "" {
  56. return fmt.Errorf("invalid network id")
  57. }
  58. if len(ipV4Data) == 0 || ipV4Data[0].Pool.String() == "0.0.0.0/0" {
  59. return types.BadRequestErrorf("ipv4 pool is empty")
  60. }
  61. // Since we perform lazy configuration make sure we try
  62. // configuring the driver when we enter CreateNetwork
  63. if err := d.configure(); err != nil {
  64. return err
  65. }
  66. n := &network{
  67. id: id,
  68. driver: d,
  69. endpoints: endpointTable{},
  70. once: &sync.Once{},
  71. subnets: []*subnet{},
  72. }
  73. for _, ipd := range ipV4Data {
  74. s := &subnet{
  75. subnetIP: ipd.Pool,
  76. gwIP: ipd.Gateway,
  77. once: &sync.Once{},
  78. }
  79. n.subnets = append(n.subnets, s)
  80. }
  81. if err := n.writeToStore(); err != nil {
  82. return fmt.Errorf("failed to update data store for network %v: %v", n.id, err)
  83. }
  84. d.addNetwork(n)
  85. return nil
  86. }
  87. func (d *driver) DeleteNetwork(nid string) error {
  88. if nid == "" {
  89. return fmt.Errorf("invalid network id")
  90. }
  91. n := d.network(nid)
  92. if n == nil {
  93. return fmt.Errorf("could not find network with id %s", nid)
  94. }
  95. d.deleteNetwork(nid)
  96. return n.releaseVxlanID()
  97. }
  98. func (n *network) incEndpointCount() {
  99. n.Lock()
  100. defer n.Unlock()
  101. n.joinCnt++
  102. }
  103. func (n *network) joinSandbox() error {
  104. // If there is a race between two go routines here only one will win
  105. // the other will wait.
  106. n.once.Do(func() {
  107. // save the error status of initSandbox in n.initErr so that
  108. // all the racing go routines are able to know the status.
  109. n.initErr = n.initSandbox()
  110. })
  111. return n.initErr
  112. }
  113. func (n *network) joinSubnetSandbox(s *subnet) error {
  114. s.once.Do(func() {
  115. s.initErr = n.initSubnetSandbox(s)
  116. })
  117. return s.initErr
  118. }
  119. func (n *network) leaveSandbox() {
  120. n.Lock()
  121. n.joinCnt--
  122. if n.joinCnt != 0 {
  123. n.Unlock()
  124. return
  125. }
  126. // We are about to destroy sandbox since the container is leaving the network
  127. // Reinitialize the once variable so that we will be able to trigger one time
  128. // sandbox initialization(again) when another container joins subsequently.
  129. n.once = &sync.Once{}
  130. for _, s := range n.subnets {
  131. s.once = &sync.Once{}
  132. }
  133. n.Unlock()
  134. n.destroySandbox()
  135. }
  136. func (n *network) destroySandbox() {
  137. sbox := n.sandbox()
  138. if sbox != nil {
  139. for _, iface := range sbox.Info().Interfaces() {
  140. if err := iface.Remove(); err != nil {
  141. logrus.Debugf("Remove interface %s failed: %v", iface.SrcName(), err)
  142. }
  143. }
  144. for _, s := range n.subnets {
  145. if hostMode {
  146. if err := removeFilters(n.id[:12], s.brName); err != nil {
  147. logrus.Warnf("Could not remove overlay filters: %v", err)
  148. }
  149. }
  150. if s.vxlanName != "" {
  151. err := deleteInterface(s.vxlanName)
  152. if err != nil {
  153. logrus.Warnf("could not cleanup sandbox properly: %v", err)
  154. }
  155. }
  156. }
  157. if hostMode {
  158. if err := removeNetworkChain(n.id[:12]); err != nil {
  159. logrus.Warnf("could not remove network chain: %v", err)
  160. }
  161. }
  162. sbox.Destroy()
  163. n.setSandbox(nil)
  164. }
  165. }
  166. func setHostMode() {
  167. if os.Getenv("_OVERLAY_HOST_MODE") != "" {
  168. hostMode = true
  169. return
  170. }
  171. err := createVxlan("testvxlan", 1)
  172. if err != nil {
  173. logrus.Errorf("Failed to create testvxlan interface: %v", err)
  174. return
  175. }
  176. defer deleteInterface("testvxlan")
  177. path := "/proc/self/ns/net"
  178. f, err := os.OpenFile(path, os.O_RDONLY, 0)
  179. if err != nil {
  180. logrus.Errorf("Failed to open path %s for network namespace for setting host mode: %v", path, err)
  181. return
  182. }
  183. defer f.Close()
  184. nsFD := f.Fd()
  185. iface, err := netlink.LinkByName("testvxlan")
  186. if err != nil {
  187. logrus.Errorf("Failed to get link testvxlan: %v", err)
  188. return
  189. }
  190. // If we are not able to move the vxlan interface to a namespace
  191. // then fallback to host mode
  192. if err := netlink.LinkSetNsFd(iface, int(nsFD)); err != nil {
  193. hostMode = true
  194. }
  195. }
  196. func (n *network) generateVxlanName(s *subnet) string {
  197. return "vx-" + fmt.Sprintf("%06x", n.vxlanID(s)) + "-" + n.id[:5]
  198. }
  199. func (n *network) generateBridgeName(s *subnet) string {
  200. return "ov-" + fmt.Sprintf("%06x", n.vxlanID(s)) + "-" + n.id[:5]
  201. }
  202. func isOverlap(nw *net.IPNet) bool {
  203. var nameservers []string
  204. if rc, err := resolvconf.Get(); err == nil {
  205. nameservers = resolvconf.GetNameserversAsCIDR(rc.Content)
  206. }
  207. if err := netutils.CheckNameserverOverlaps(nameservers, nw); err != nil {
  208. return true
  209. }
  210. if err := netutils.CheckRouteOverlaps(nw); err != nil {
  211. return true
  212. }
  213. return false
  214. }
  215. func (n *network) initSubnetSandbox(s *subnet) error {
  216. brName := n.generateBridgeName(s)
  217. vxlanName := n.generateVxlanName(s)
  218. if hostMode {
  219. // Try to delete stale bridge interface if it exists
  220. deleteInterface(brName)
  221. // Try to delete the vxlan interface by vni if already present
  222. deleteVxlanByVNI(n.vxlanID(s))
  223. if isOverlap(s.subnetIP) {
  224. return fmt.Errorf("overlay subnet %s has conflicts in the host while running in host mode", s.subnetIP.String())
  225. }
  226. }
  227. // create a bridge and vxlan device for this subnet and move it to the sandbox
  228. sbox := n.sandbox()
  229. if err := sbox.AddInterface(brName, "br",
  230. sbox.InterfaceOptions().Address(s.gwIP),
  231. sbox.InterfaceOptions().Bridge(true)); err != nil {
  232. return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), err)
  233. }
  234. err := createVxlan(vxlanName, n.vxlanID(s))
  235. if err != nil {
  236. return err
  237. }
  238. if err := sbox.AddInterface(vxlanName, "vxlan",
  239. sbox.InterfaceOptions().Master(brName)); err != nil {
  240. return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err)
  241. }
  242. if hostMode {
  243. if err := addFilters(n.id[:12], brName); err != nil {
  244. return err
  245. }
  246. }
  247. n.Lock()
  248. s.vxlanName = vxlanName
  249. s.brName = brName
  250. n.Unlock()
  251. return nil
  252. }
  253. func (n *network) cleanupStaleSandboxes() {
  254. filepath.Walk(filepath.Dir(osl.GenerateKey("walk")),
  255. func(path string, info os.FileInfo, err error) error {
  256. _, fname := filepath.Split(path)
  257. pList := strings.Split(fname, "-")
  258. if len(pList) <= 1 {
  259. return nil
  260. }
  261. pattern := pList[1]
  262. if strings.Contains(n.id, pattern) {
  263. syscall.Unmount(path, syscall.MNT_DETACH)
  264. os.Remove(path)
  265. }
  266. return nil
  267. })
  268. }
  269. func (n *network) initSandbox() error {
  270. n.Lock()
  271. n.initEpoch++
  272. n.Unlock()
  273. hostModeOnce.Do(setHostMode)
  274. if hostMode {
  275. if err := addNetworkChain(n.id[:12]); err != nil {
  276. return err
  277. }
  278. }
  279. // If there are any stale sandboxes related to this network
  280. // from previous daemon life clean it up here
  281. n.cleanupStaleSandboxes()
  282. sbox, err := osl.NewSandbox(
  283. osl.GenerateKey(fmt.Sprintf("%d-", n.initEpoch)+n.id), !hostMode)
  284. if err != nil {
  285. return fmt.Errorf("could not create network sandbox: %v", err)
  286. }
  287. n.setSandbox(sbox)
  288. n.driver.peerDbUpdateSandbox(n.id)
  289. var nlSock *nl.NetlinkSocket
  290. sbox.InvokeFunc(func() {
  291. nlSock, err = nl.Subscribe(syscall.NETLINK_ROUTE, syscall.RTNLGRP_NEIGH)
  292. if err != nil {
  293. err = fmt.Errorf("failed to subscribe to neighbor group netlink messages")
  294. }
  295. })
  296. go n.watchMiss(nlSock)
  297. return nil
  298. }
  299. func (n *network) watchMiss(nlSock *nl.NetlinkSocket) {
  300. for {
  301. msgs, err := nlSock.Receive()
  302. if err != nil {
  303. logrus.Errorf("Failed to receive from netlink: %v ", err)
  304. continue
  305. }
  306. for _, msg := range msgs {
  307. if msg.Header.Type != syscall.RTM_GETNEIGH && msg.Header.Type != syscall.RTM_NEWNEIGH {
  308. continue
  309. }
  310. neigh, err := netlink.NeighDeserialize(msg.Data)
  311. if err != nil {
  312. logrus.Errorf("Failed to deserialize netlink ndmsg: %v", err)
  313. continue
  314. }
  315. if neigh.IP.To16() != nil {
  316. continue
  317. }
  318. if neigh.State&(netlink.NUD_STALE|netlink.NUD_INCOMPLETE) == 0 {
  319. continue
  320. }
  321. mac, IPmask, vtep, err := n.driver.resolvePeer(n.id, neigh.IP)
  322. if err != nil {
  323. logrus.Errorf("could not resolve peer %q: %v", neigh.IP, err)
  324. continue
  325. }
  326. if err := n.driver.peerAdd(n.id, "dummy", neigh.IP, IPmask, mac, vtep, true); err != nil {
  327. logrus.Errorf("could not add neighbor entry for missed peer %q: %v", neigh.IP, err)
  328. }
  329. }
  330. }
  331. }
  332. func (d *driver) addNetwork(n *network) {
  333. d.Lock()
  334. d.networks[n.id] = n
  335. d.Unlock()
  336. }
  337. func (d *driver) deleteNetwork(nid string) {
  338. d.Lock()
  339. delete(d.networks, nid)
  340. d.Unlock()
  341. }
  342. func (d *driver) network(nid string) *network {
  343. d.Lock()
  344. networks := d.networks
  345. d.Unlock()
  346. n, ok := networks[nid]
  347. if !ok {
  348. n = d.getNetworkFromStore(nid)
  349. if n != nil {
  350. n.driver = d
  351. n.endpoints = endpointTable{}
  352. n.once = &sync.Once{}
  353. networks[nid] = n
  354. }
  355. }
  356. return n
  357. }
  358. func (d *driver) getNetworkFromStore(nid string) *network {
  359. if d.store == nil {
  360. return nil
  361. }
  362. n := &network{id: nid}
  363. if err := d.store.GetObject(datastore.Key(n.Key()...), n); err != nil {
  364. return nil
  365. }
  366. return n
  367. }
  368. func (n *network) sandbox() osl.Sandbox {
  369. n.Lock()
  370. defer n.Unlock()
  371. return n.sbox
  372. }
  373. func (n *network) setSandbox(sbox osl.Sandbox) {
  374. n.Lock()
  375. n.sbox = sbox
  376. n.Unlock()
  377. }
  378. func (n *network) vxlanID(s *subnet) uint32 {
  379. n.Lock()
  380. defer n.Unlock()
  381. return s.vni
  382. }
  383. func (n *network) setVxlanID(s *subnet, vni uint32) {
  384. n.Lock()
  385. s.vni = vni
  386. n.Unlock()
  387. }
  388. func (n *network) Key() []string {
  389. return []string{"overlay", "network", n.id}
  390. }
  391. func (n *network) KeyPrefix() []string {
  392. return []string{"overlay", "network"}
  393. }
  394. func (n *network) Value() []byte {
  395. netJSON := []*subnetJSON{}
  396. for _, s := range n.subnets {
  397. sj := &subnetJSON{
  398. SubnetIP: s.subnetIP.String(),
  399. GwIP: s.gwIP.String(),
  400. Vni: s.vni,
  401. }
  402. netJSON = append(netJSON, sj)
  403. }
  404. b, err := json.Marshal(netJSON)
  405. if err != nil {
  406. return []byte{}
  407. }
  408. return b
  409. }
  410. func (n *network) Index() uint64 {
  411. return n.dbIndex
  412. }
  413. func (n *network) SetIndex(index uint64) {
  414. n.dbIndex = index
  415. n.dbExists = true
  416. }
  417. func (n *network) Exists() bool {
  418. return n.dbExists
  419. }
  420. func (n *network) Skip() bool {
  421. return false
  422. }
  423. func (n *network) SetValue(value []byte) error {
  424. var newNet bool
  425. netJSON := []*subnetJSON{}
  426. err := json.Unmarshal(value, &netJSON)
  427. if err != nil {
  428. return err
  429. }
  430. if len(n.subnets) == 0 {
  431. newNet = true
  432. }
  433. for _, sj := range netJSON {
  434. subnetIPstr := sj.SubnetIP
  435. gwIPstr := sj.GwIP
  436. vni := sj.Vni
  437. subnetIP, _ := types.ParseCIDR(subnetIPstr)
  438. gwIP, _ := types.ParseCIDR(gwIPstr)
  439. if newNet {
  440. s := &subnet{
  441. subnetIP: subnetIP,
  442. gwIP: gwIP,
  443. vni: vni,
  444. once: &sync.Once{},
  445. }
  446. n.subnets = append(n.subnets, s)
  447. } else {
  448. sNet := n.getMatchingSubnet(subnetIP)
  449. if sNet != nil {
  450. sNet.vni = vni
  451. }
  452. }
  453. }
  454. return nil
  455. }
  456. func (n *network) DataScope() string {
  457. return datastore.GlobalScope
  458. }
  459. func (n *network) writeToStore() error {
  460. return n.driver.store.PutObjectAtomic(n)
  461. }
  462. func (n *network) releaseVxlanID() error {
  463. if n.driver.store == nil {
  464. return fmt.Errorf("no datastore configured. cannot release vxlan id")
  465. }
  466. if len(n.subnets) == 0 {
  467. return nil
  468. }
  469. if err := n.driver.store.DeleteObjectAtomic(n); err != nil {
  470. if err == datastore.ErrKeyModified || err == datastore.ErrKeyNotFound {
  471. // In both the above cases we can safely assume that the key has been removed by some other
  472. // instance and so simply get out of here
  473. return nil
  474. }
  475. return fmt.Errorf("failed to delete network to vxlan id map: %v", err)
  476. }
  477. for _, s := range n.subnets {
  478. n.driver.vxlanIdm.Release(uint64(n.vxlanID(s)))
  479. n.setVxlanID(s, 0)
  480. }
  481. return nil
  482. }
  483. func (n *network) obtainVxlanID(s *subnet) error {
  484. //return if the subnet already has a vxlan id assigned
  485. if s.vni != 0 {
  486. return nil
  487. }
  488. if n.driver.store == nil {
  489. return fmt.Errorf("no datastore configured. cannot obtain vxlan id")
  490. }
  491. for {
  492. if err := n.driver.store.GetObject(datastore.Key(n.Key()...), n); err != nil {
  493. return fmt.Errorf("getting network %q from datastore failed %v", n.id, err)
  494. }
  495. if s.vni == 0 {
  496. vxlanID, err := n.driver.vxlanIdm.GetID()
  497. if err != nil {
  498. return fmt.Errorf("failed to allocate vxlan id: %v", err)
  499. }
  500. n.setVxlanID(s, uint32(vxlanID))
  501. if err := n.writeToStore(); err != nil {
  502. n.driver.vxlanIdm.Release(uint64(n.vxlanID(s)))
  503. n.setVxlanID(s, 0)
  504. if err == datastore.ErrKeyModified {
  505. continue
  506. }
  507. return fmt.Errorf("network %q failed to update data store: %v", n.id, err)
  508. }
  509. return nil
  510. }
  511. return nil
  512. }
  513. }
  514. // getSubnetforIP returns the subnet to which the given IP belongs
  515. func (n *network) getSubnetforIP(ip *net.IPNet) *subnet {
  516. for _, s := range n.subnets {
  517. // first check if the mask lengths are the same
  518. i, _ := s.subnetIP.Mask.Size()
  519. j, _ := ip.Mask.Size()
  520. if i != j {
  521. continue
  522. }
  523. if s.subnetIP.Contains(ip.IP) {
  524. return s
  525. }
  526. }
  527. return nil
  528. }
  529. // getMatchingSubnet return the network's subnet that matches the input
  530. func (n *network) getMatchingSubnet(ip *net.IPNet) *subnet {
  531. if ip == nil {
  532. return nil
  533. }
  534. for _, s := range n.subnets {
  535. // first check if the mask lengths are the same
  536. i, _ := s.subnetIP.Mask.Size()
  537. j, _ := ip.Mask.Size()
  538. if i != j {
  539. continue
  540. }
  541. if s.subnetIP.IP.Equal(ip.IP) {
  542. return s
  543. }
  544. }
  545. return nil
  546. }