peerdb.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. //go:build linux
  2. package overlay
  3. import (
  4. "context"
  5. "fmt"
  6. "net"
  7. "sync"
  8. "syscall"
  9. "github.com/containerd/log"
  10. "github.com/docker/docker/libnetwork/internal/setmatrix"
  11. "github.com/docker/docker/libnetwork/osl"
  12. )
// ovPeerTable holds the string "overlay_peer_table". It is not referenced by
// any of the code shown in this file.
// NOTE(review): presumably the name of the NetworkDB table that carries
// overlay peer records — confirm against the code that uses it.
const ovPeerTable = "overlay_peer_table"
// peerKey identifies a peer by the pair <IP address, MAC address>.
// Its String form is what peerDbAdd and peerDbDelete use as the key of the
// per-network SetMatrix; Scan parses that form back into a peerKey.
type peerKey struct {
	peerIP  net.IP
	peerMac net.HardwareAddr
}
  18. type peerEntry struct {
  19. eid string
  20. vtep net.IP
  21. peerIPMask net.IPMask
  22. isLocal bool
  23. }
  24. func (p *peerEntry) MarshalDB() peerEntryDB {
  25. ones, bits := p.peerIPMask.Size()
  26. return peerEntryDB{
  27. eid: p.eid,
  28. vtep: p.vtep.String(),
  29. peerIPMaskOnes: ones,
  30. peerIPMaskBits: bits,
  31. isLocal: p.isLocal,
  32. }
  33. }
  34. // This the structure saved into the set (SetMatrix), due to the implementation of it
  35. // the value inserted in the set has to be Hashable so the []byte had to be converted into
  36. // strings
  37. type peerEntryDB struct {
  38. eid string
  39. vtep string
  40. peerIPMaskOnes int
  41. peerIPMaskBits int
  42. isLocal bool
  43. }
  44. func (p *peerEntryDB) UnMarshalDB() peerEntry {
  45. return peerEntry{
  46. eid: p.eid,
  47. vtep: net.ParseIP(p.vtep),
  48. peerIPMask: net.CIDRMask(p.peerIPMaskOnes, p.peerIPMaskBits),
  49. isLocal: p.isLocal,
  50. }
  51. }
// peerMap holds the peer records of one network. The embedded mutex is taken
// by the code in this file around every access to mp.
type peerMap struct {
	// mp is a set of peerEntryDB values per key; peerDbAdd and peerDbDelete use
	// peerKey.String() as the key. Note the values have to be objects and not
	// pointers to maintain the proper equality checks.
	mp setmatrix.SetMatrix[peerEntryDB]
	sync.Mutex
}
// peerNetworkMap groups one peerMap per map key, guarded by the embedded mutex.
type peerNetworkMap struct {
	// mp maps a string key to that key's peerMap.
	// NOTE(review): the driver code in this file indexes d.peerDb.mp by network
	// ID (nid), so this is presumably keyed by network ID — the previous
	// comment ("map with key peerKey") did not match that usage. peerKey
	// strings are the keys of the inner peerMap.mp instead.
	mp map[string]*peerMap
	sync.Mutex
}
  62. func (pKey peerKey) String() string {
  63. return fmt.Sprintf("%s %s", pKey.peerIP, pKey.peerMac)
  64. }
  65. func (pKey *peerKey) Scan(state fmt.ScanState, verb rune) error {
  66. ipB, err := state.Token(true, nil)
  67. if err != nil {
  68. return err
  69. }
  70. pKey.peerIP = net.ParseIP(string(ipB))
  71. macB, err := state.Token(true, nil)
  72. if err != nil {
  73. return err
  74. }
  75. pKey.peerMac, err = net.ParseMAC(string(macB))
  76. return err
  77. }
  78. func (d *driver) peerDbWalk(f func(string, *peerKey, *peerEntry) bool) error {
  79. d.peerDb.Lock()
  80. nids := []string{}
  81. for nid := range d.peerDb.mp {
  82. nids = append(nids, nid)
  83. }
  84. d.peerDb.Unlock()
  85. for _, nid := range nids {
  86. d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
  87. return f(nid, pKey, pEntry)
  88. })
  89. }
  90. return nil
  91. }
  92. func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool) error {
  93. d.peerDb.Lock()
  94. pMap, ok := d.peerDb.mp[nid]
  95. d.peerDb.Unlock()
  96. if !ok {
  97. return nil
  98. }
  99. mp := map[string]peerEntry{}
  100. pMap.Lock()
  101. for _, pKeyStr := range pMap.mp.Keys() {
  102. entryDBList, ok := pMap.mp.Get(pKeyStr)
  103. if ok {
  104. peerEntryDB := entryDBList[0]
  105. mp[pKeyStr] = peerEntryDB.UnMarshalDB()
  106. }
  107. }
  108. pMap.Unlock()
  109. for pKeyStr, pEntry := range mp {
  110. var pKey peerKey
  111. pEntry := pEntry
  112. if _, err := fmt.Sscan(pKeyStr, &pKey); err != nil {
  113. log.G(context.TODO()).Warnf("Peer key scan on network %s failed: %v", nid, err)
  114. }
  115. if f(&pKey, &pEntry) {
  116. return nil
  117. }
  118. }
  119. return nil
  120. }
  121. func (d *driver) peerDbSearch(nid string, peerIP net.IP) (*peerKey, *peerEntry, error) {
  122. var pKeyMatched *peerKey
  123. var pEntryMatched *peerEntry
  124. err := d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
  125. if pKey.peerIP.Equal(peerIP) {
  126. pKeyMatched = pKey
  127. pEntryMatched = pEntry
  128. return true
  129. }
  130. return false
  131. })
  132. if err != nil {
  133. return nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err)
  134. }
  135. if pKeyMatched == nil || pEntryMatched == nil {
  136. return nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP)
  137. }
  138. return pKeyMatched, pEntryMatched, nil
  139. }
  140. func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) {
  141. d.peerDb.Lock()
  142. pMap, ok := d.peerDb.mp[nid]
  143. if !ok {
  144. pMap = &peerMap{}
  145. d.peerDb.mp[nid] = pMap
  146. }
  147. d.peerDb.Unlock()
  148. pKey := peerKey{
  149. peerIP: peerIP,
  150. peerMac: peerMac,
  151. }
  152. pEntry := peerEntry{
  153. eid: eid,
  154. vtep: vtep,
  155. peerIPMask: peerIPMask,
  156. isLocal: isLocal,
  157. }
  158. pMap.Lock()
  159. defer pMap.Unlock()
  160. b, i := pMap.mp.Insert(pKey.String(), pEntry.MarshalDB())
  161. if i != 1 {
  162. // Transient case, there is more than one endpoint that is using the same IP,MAC pair
  163. s, _ := pMap.mp.String(pKey.String())
  164. log.G(context.TODO()).Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
  165. }
  166. return b, i
  167. }
  168. func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) {
  169. d.peerDb.Lock()
  170. pMap, ok := d.peerDb.mp[nid]
  171. if !ok {
  172. d.peerDb.Unlock()
  173. return false, 0
  174. }
  175. d.peerDb.Unlock()
  176. pKey := peerKey{
  177. peerIP: peerIP,
  178. peerMac: peerMac,
  179. }
  180. pEntry := peerEntry{
  181. eid: eid,
  182. vtep: vtep,
  183. peerIPMask: peerIPMask,
  184. isLocal: isLocal,
  185. }
  186. pMap.Lock()
  187. defer pMap.Unlock()
  188. b, i := pMap.mp.Remove(pKey.String(), pEntry.MarshalDB())
  189. if i != 0 {
  190. // Transient case, there is more than one endpoint that is using the same IP,MAC pair
  191. s, _ := pMap.mp.String(pKey.String())
  192. log.G(context.TODO()).Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
  193. }
  194. return b, i
  195. }
  196. // The overlay uses a lazy initialization approach, this means that when a network is created
  197. // and the driver registered the overlay does not allocate resources till the moment that a
  198. // sandbox is actually created.
  199. // At the moment of this call, that happens when a sandbox is initialized, is possible that
  200. // networkDB has already delivered some events of peers already available on remote nodes,
  201. // these peers are saved into the peerDB and this function is used to properly configure
  202. // the network sandbox with all those peers that got previously notified.
  203. // Note also that this method sends a single message on the channel and the go routine on the
  204. // other side, will atomically loop on the whole table of peers and will program their state
  205. // in one single atomic operation. This is fundamental to guarantee consistency, and avoid that
  206. // new peerAdd or peerDelete gets reordered during the sandbox init.
  207. func (d *driver) initSandboxPeerDB(nid string) {
  208. d.peerOpMu.Lock()
  209. defer d.peerOpMu.Unlock()
  210. if err := d.peerInitOp(nid); err != nil {
  211. log.G(context.TODO()).WithError(err).Warn("Peer init operation failed")
  212. }
  213. }
  214. func (d *driver) peerInitOp(nid string) error {
  215. return d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
  216. // Local entries do not need to be added
  217. if pEntry.isLocal {
  218. return false
  219. }
  220. d.peerAddOp(nid, pEntry.eid, pKey.peerIP, pEntry.peerIPMask, pKey.peerMac, pEntry.vtep, false, false, false, pEntry.isLocal)
  221. // return false to loop on all entries
  222. return false
  223. })
  224. }
  225. func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, localPeer bool) {
  226. d.peerOpMu.Lock()
  227. defer d.peerOpMu.Unlock()
  228. err := d.peerAddOp(nid, eid, peerIP, peerIPMask, peerMac, vtep, l2Miss, l3Miss, true, localPeer)
  229. if err != nil {
  230. log.G(context.TODO()).WithError(err).Warn("Peer add operation failed")
  231. }
  232. }
  233. func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, localPeer bool) error {
  234. if err := validateID(nid, eid); err != nil {
  235. return err
  236. }
  237. var dbEntries int
  238. var inserted bool
  239. if updateDB {
  240. inserted, dbEntries = d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
  241. if !inserted {
  242. log.G(context.TODO()).Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v",
  243. nid, eid, peerIP, peerMac, localPeer, vtep)
  244. }
  245. }
  246. // Local peers do not need any further configuration
  247. if localPeer {
  248. return nil
  249. }
  250. n := d.network(nid)
  251. if n == nil {
  252. return nil
  253. }
  254. sbox := n.sandbox()
  255. if sbox == nil {
  256. // We are hitting this case for all the events that are arriving before that the sandbox
  257. // is being created. The peer got already added into the database and the sanbox init will
  258. // call the peerDbUpdateSandbox that will configure all these peers from the database
  259. return nil
  260. }
  261. IP := &net.IPNet{
  262. IP: peerIP,
  263. Mask: peerIPMask,
  264. }
  265. s := n.getSubnetforIP(IP)
  266. if s == nil {
  267. return fmt.Errorf("couldn't find the subnet %q in network %q", IP.String(), n.id)
  268. }
  269. if err := n.joinSandbox(s, false); err != nil {
  270. return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err)
  271. }
  272. if err := d.checkEncryption(nid, vtep, false, true); err != nil {
  273. log.G(context.TODO()).Warn(err)
  274. }
  275. // Add neighbor entry for the peer IP
  276. if err := sbox.AddNeighbor(peerIP, peerMac, l3Miss, osl.WithLinkName(s.vxlanName)); err != nil {
  277. if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 1 {
  278. // We are in the transient case so only the first configuration is programmed into the kernel
  279. // Upon deletion if the active configuration is deleted the next one from the database will be restored
  280. // Note we are skipping also the next configuration
  281. return nil
  282. }
  283. return fmt.Errorf("could not add neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
  284. }
  285. // Add fdb entry to the bridge for the peer mac
  286. if err := sbox.AddNeighbor(vtep, peerMac, l2Miss, osl.WithLinkName(s.vxlanName), osl.WithFamily(syscall.AF_BRIDGE)); err != nil {
  287. return fmt.Errorf("could not add fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
  288. }
  289. return nil
  290. }
  291. func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, peerMac net.HardwareAddr, vtep net.IP, localPeer bool) {
  292. d.peerOpMu.Lock()
  293. defer d.peerOpMu.Unlock()
  294. err := d.peerDeleteOp(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
  295. if err != nil {
  296. log.G(context.TODO()).WithError(err).Warn("Peer delete operation failed")
  297. }
  298. }
  299. func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, peerMac net.HardwareAddr, vtep net.IP, localPeer bool) error {
  300. if err := validateID(nid, eid); err != nil {
  301. return err
  302. }
  303. deleted, dbEntries := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
  304. if !deleted {
  305. log.G(context.TODO()).Warnf("Entry was not in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v",
  306. nid, eid, peerIP, peerMac, localPeer, vtep)
  307. }
  308. n := d.network(nid)
  309. if n == nil {
  310. return nil
  311. }
  312. sbox := n.sandbox()
  313. if sbox == nil {
  314. return nil
  315. }
  316. if err := d.checkEncryption(nid, vtep, localPeer, false); err != nil {
  317. log.G(context.TODO()).Warn(err)
  318. }
  319. // Local peers do not have any local configuration to delete
  320. if !localPeer {
  321. // Remove fdb entry to the bridge for the peer mac
  322. if err := sbox.DeleteNeighbor(vtep, peerMac); err != nil {
  323. if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 {
  324. // We fall in here if there is a transient state and if the neighbor that is being deleted
  325. // was never been configured into the kernel (we allow only 1 configuration at the time per <ip,mac> mapping)
  326. return nil
  327. }
  328. return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
  329. }
  330. // Delete neighbor entry for the peer IP
  331. if err := sbox.DeleteNeighbor(peerIP, peerMac); err != nil {
  332. return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
  333. }
  334. }
  335. if dbEntries == 0 {
  336. return nil
  337. }
  338. // If there is still an entry into the database and the deletion went through without errors means that there is now no
  339. // configuration active in the kernel.
  340. // Restore one configuration for the <ip,mac> directly from the database, note that is guaranteed that there is one
  341. peerKey, peerEntry, err := d.peerDbSearch(nid, peerIP)
  342. if err != nil {
  343. log.G(context.TODO()).Errorf("peerDeleteOp unable to restore a configuration for nid:%s ip:%v mac:%v err:%s", nid, peerIP, peerMac, err)
  344. return err
  345. }
  346. return d.peerAddOp(nid, peerEntry.eid, peerIP, peerEntry.peerIPMask, peerKey.peerMac, peerEntry.vtep, false, false, false, peerEntry.isLocal)
  347. }
  348. func (d *driver) peerFlush(nid string) {
  349. d.peerOpMu.Lock()
  350. defer d.peerOpMu.Unlock()
  351. if err := d.peerFlushOp(nid); err != nil {
  352. log.G(context.TODO()).WithError(err).Warn("Peer flush operation failed")
  353. }
  354. }
  355. func (d *driver) peerFlushOp(nid string) error {
  356. d.peerDb.Lock()
  357. defer d.peerDb.Unlock()
  358. _, ok := d.peerDb.mp[nid]
  359. if !ok {
  360. return fmt.Errorf("Unable to find the peerDB for nid:%s", nid)
  361. }
  362. delete(d.peerDb.mp, nid)
  363. return nil
  364. }
  365. func (d *driver) peerDBUpdateSelf() {
  366. d.peerDbWalk(func(nid string, pkey *peerKey, pEntry *peerEntry) bool {
  367. if pEntry.isLocal {
  368. pEntry.vtep = d.advertiseAddress
  369. }
  370. return false
  371. })
  372. }