delegate.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480
  1. package networkdb
  2. import (
  3. "net"
  4. "time"
  5. "github.com/gogo/protobuf/proto"
  6. "github.com/sirupsen/logrus"
  7. )
// delegate is the memberlist delegate implementation for NetworkDB. It
// forwards gossip callbacks from memberlist to the wrapped NetworkDB.
type delegate struct {
	nDB *NetworkDB
}
// NodeMeta returns the per-node metadata to gossip with memberlist.
// NetworkDB attaches no metadata, so this is always empty regardless of limit.
func (d *delegate) NodeMeta(limit int) []byte {
	return []byte{}
}
// handleNodeEvent applies a remote node join/leave event to the locally
// known node state. It returns true when the event changed local state
// (or must be propagated anyway) and should therefore be rebroadcast.
func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool {
	// Update our local clock if the received messages has newer
	// time.
	nDB.networkClock.Witness(nEvent.LTime)

	nDB.RLock()
	defer nDB.RUnlock()

	// check if the node exists
	n, _, _ := nDB.findNode(nEvent.NodeName)
	if n == nil {
		return false
	}

	// check if the event is fresh; stale events (older Lamport time) are dropped
	if n.ltime >= nEvent.LTime {
		return false
	}

	// If we are here means that the event is fresher and the node is known. Update the Lamport time.
	// NOTE(review): n.ltime is written while holding only the read lock — confirm
	// that concurrent writers of node.ltime are serialized elsewhere.
	n.ltime = nEvent.LTime

	// If it is a node leave event for a manager and this is the only manager we
	// know of we want the reconnect logic to kick in. In a single manager
	// cluster manager's gossip can't be bootstrapped unless some other node
	// connects to it.
	if len(nDB.bootStrapIP) == 1 && nEvent.Type == NodeEventTypeLeave {
		for _, ip := range nDB.bootStrapIP {
			if ip.Equal(n.Addr) {
				// Propagate the leave even though we don't change state here,
				// so peers learn the bootstrap manager is gone.
				return true
			}
		}
	}

	switch nEvent.Type {
	case NodeEventTypeJoin:
		moved, err := nDB.changeNodeState(n.Name, nodeActiveState)
		if err != nil {
			logrus.WithError(err).Error("unable to find the node to move")
			return false
		}
		if moved {
			logrus.Infof("%v(%v): Node join event for %s/%s", nDB.config.Hostname, nDB.config.NodeID, n.Name, n.Addr)
		}
		// Rebroadcast only if the node actually changed state.
		return moved
	case NodeEventTypeLeave:
		moved, err := nDB.changeNodeState(n.Name, nodeLeftState)
		if err != nil {
			logrus.WithError(err).Error("unable to find the node to move")
			return false
		}
		if moved {
			logrus.Infof("%v(%v): Node leave event for %s/%s", nDB.config.Hostname, nDB.config.NodeID, n.Name, n.Addr)
		}
		return moved
	}

	// Unknown event type: nothing to do, do not rebroadcast.
	return false
}
// handleNetworkEvent applies a remote network join/leave event to the
// per-node network tables. It returns true when the event changed local
// state and should be rebroadcast to other peers.
func (nDB *NetworkDB) handleNetworkEvent(nEvent *NetworkEvent) bool {
	// Update our local clock if the received messages has newer
	// time.
	nDB.networkClock.Witness(nEvent.LTime)

	nDB.Lock()
	defer nDB.Unlock()

	// Ignore events about this node itself; local state is authoritative.
	if nEvent.NodeName == nDB.config.NodeID {
		return false
	}

	nodeNetworks, ok := nDB.networks[nEvent.NodeName]
	if !ok {
		// We haven't heard about this node at all. Ignore the leave
		if nEvent.Type == NetworkEventTypeLeave {
			return false
		}

		nodeNetworks = make(map[string]*network)
		nDB.networks[nEvent.NodeName] = nodeNetworks
	}

	if n, ok := nodeNetworks[nEvent.NetworkID]; ok {
		// We have the latest state. Ignore the event
		// since it is stale.
		if n.ltime >= nEvent.LTime {
			return false
		}

		n.ltime = nEvent.LTime
		n.leaving = nEvent.Type == NetworkEventTypeLeave
		if n.leaving {
			// Schedule the network entry for reaping after the configured interval.
			n.reapTime = nDB.config.reapNetworkInterval

			// The remote node is leaving the network, but not the gossip cluster.
			// Mark all its entries in deleted state, this will guarantee that
			// if some node bulk sync with us, the deleted state of
			// these entries will be propagated.
			nDB.deleteNodeNetworkEntries(nEvent.NetworkID, nEvent.NodeName)
		}

		if nEvent.Type == NetworkEventTypeLeave {
			nDB.deleteNetworkNode(nEvent.NetworkID, nEvent.NodeName)
		} else {
			nDB.addNetworkNode(nEvent.NetworkID, nEvent.NodeName)
		}

		return true
	}

	// Network not yet known for this node: a leave for an unknown network is a no-op.
	if nEvent.Type == NetworkEventTypeLeave {
		return false
	}

	// If the node is not known from memberlist we cannot process save any state of it else if it actually
	// dies we won't receive any notification and we will remain stuck with it
	if _, ok := nDB.nodes[nEvent.NodeName]; !ok {
		return false
	}

	// This remote network join is being seen the first time.
	nodeNetworks[nEvent.NetworkID] = &network{
		id:    nEvent.NetworkID,
		ltime: nEvent.LTime,
	}

	nDB.addNetworkNode(nEvent.NetworkID, nEvent.NodeName)
	return true
}
// handleTableEvent applies a remote table entry create/update/delete to
// local table state. It returns true when the event should be
// rebroadcast to other peers.
func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent) bool {
	// Update our local clock if the received messages has newer time.
	nDB.tableClock.Witness(tEvent.LTime)

	// Ignore the table events for networks that are in the process of going away
	nDB.RLock()
	networks := nDB.networks[nDB.config.NodeID]
	network, ok := networks[tEvent.NetworkID]
	// Check if the owner of the event is still part of the network
	nodes := nDB.networkNodes[tEvent.NetworkID]
	var nodePresent bool
	for _, node := range nodes {
		if node == tEvent.NodeName {
			nodePresent = true
			break
		}
	}
	nDB.RUnlock()
	if !ok || network.leaving || !nodePresent {
		// I'm out of the network OR the event owner is not anymore part of the network so do not propagate
		return false
	}

	// err is deliberately kept: a non-nil err below means we had no prior
	// state for this key, which changes the propagation decision for deletes.
	e, err := nDB.getEntry(tEvent.TableName, tEvent.NetworkID, tEvent.Key)
	if err == nil {
		// We have the latest state. Ignore the event
		// since it is stale.
		if e.ltime >= tEvent.LTime {
			return false
		}
	}

	e = &entry{
		ltime:    tEvent.LTime,
		node:     tEvent.NodeName,
		value:    tEvent.Value,
		deleting: tEvent.Type == TableEventTypeDelete,
		reapTime: time.Duration(tEvent.ResidualReapTime) * time.Second,
	}

	// All the entries marked for deletion should have a reapTime set greater than 0
	// This case can happen if the cluster is running different versions of the engine where the old version does not have the
	// field. If that is not the case, this can be a BUG
	if e.deleting && e.reapTime == 0 {
		logrus.Warnf("%v(%v) handleTableEvent object %+v has a 0 reapTime, is the cluster running the same docker engine version?",
			nDB.config.Hostname, nDB.config.NodeID, tEvent)
		e.reapTime = nDB.config.reapEntryInterval
	}

	nDB.Lock()
	nDB.createOrUpdateEntry(tEvent.NetworkID, tEvent.TableName, tEvent.Key, e)
	nDB.Unlock()

	if err != nil && tEvent.Type == TableEventTypeDelete {
		// If it is a delete event and we did not have a state for it, don't propagate to the application
		// If the residual reapTime is lower or equal to 1/6 of the total reapTime don't bother broadcasting it around
		// most likely the cluster is already aware of it, if not who will sync with this node will catch the state too.
		// This also avoids that deletion of entries close to their garbage collection ends up circuling around forever
		return e.reapTime > nDB.config.reapEntryInterval/6
	}

	// Notify local watchers of the table change.
	var op opType
	switch tEvent.Type {
	case TableEventTypeCreate:
		op = opCreate
	case TableEventTypeUpdate:
		op = opUpdate
	case TableEventTypeDelete:
		op = opDelete
	}

	nDB.broadcaster.Write(makeEvent(op, tEvent.TableName, tEvent.NetworkID, tEvent.Key, tEvent.Value))
	return true
}
  189. func (nDB *NetworkDB) handleCompound(buf []byte, isBulkSync bool) {
  190. // Decode the parts
  191. parts, err := decodeCompoundMessage(buf)
  192. if err != nil {
  193. logrus.Errorf("Failed to decode compound request: %v", err)
  194. return
  195. }
  196. // Handle each message
  197. for _, part := range parts {
  198. nDB.handleMessage(part, isBulkSync)
  199. }
  200. }
  201. func (nDB *NetworkDB) handleTableMessage(buf []byte, isBulkSync bool) {
  202. var tEvent TableEvent
  203. if err := proto.Unmarshal(buf, &tEvent); err != nil {
  204. logrus.Errorf("Error decoding table event message: %v", err)
  205. return
  206. }
  207. // Ignore messages that this node generated.
  208. if tEvent.NodeName == nDB.config.NodeID {
  209. return
  210. }
  211. if rebroadcast := nDB.handleTableEvent(&tEvent); rebroadcast {
  212. var err error
  213. buf, err = encodeRawMessage(MessageTypeTableEvent, buf)
  214. if err != nil {
  215. logrus.Errorf("Error marshalling gossip message for network event rebroadcast: %v", err)
  216. return
  217. }
  218. nDB.RLock()
  219. n, ok := nDB.networks[nDB.config.NodeID][tEvent.NetworkID]
  220. nDB.RUnlock()
  221. // if the network is not there anymore, OR we are leaving the network OR the broadcast queue is not present
  222. if !ok || n.leaving || n.tableBroadcasts == nil {
  223. return
  224. }
  225. n.tableBroadcasts.QueueBroadcast(&tableEventMessage{
  226. msg: buf,
  227. id: tEvent.NetworkID,
  228. tname: tEvent.TableName,
  229. key: tEvent.Key,
  230. node: tEvent.NodeName,
  231. })
  232. }
  233. }
  234. func (nDB *NetworkDB) handleNodeMessage(buf []byte) {
  235. var nEvent NodeEvent
  236. if err := proto.Unmarshal(buf, &nEvent); err != nil {
  237. logrus.Errorf("Error decoding node event message: %v", err)
  238. return
  239. }
  240. if rebroadcast := nDB.handleNodeEvent(&nEvent); rebroadcast {
  241. var err error
  242. buf, err = encodeRawMessage(MessageTypeNodeEvent, buf)
  243. if err != nil {
  244. logrus.Errorf("Error marshalling gossip message for node event rebroadcast: %v", err)
  245. return
  246. }
  247. nDB.nodeBroadcasts.QueueBroadcast(&nodeEventMessage{
  248. msg: buf,
  249. })
  250. }
  251. }
  252. func (nDB *NetworkDB) handleNetworkMessage(buf []byte) {
  253. var nEvent NetworkEvent
  254. if err := proto.Unmarshal(buf, &nEvent); err != nil {
  255. logrus.Errorf("Error decoding network event message: %v", err)
  256. return
  257. }
  258. if rebroadcast := nDB.handleNetworkEvent(&nEvent); rebroadcast {
  259. var err error
  260. buf, err = encodeRawMessage(MessageTypeNetworkEvent, buf)
  261. if err != nil {
  262. logrus.Errorf("Error marshalling gossip message for network event rebroadcast: %v", err)
  263. return
  264. }
  265. nDB.networkBroadcasts.QueueBroadcast(&networkEventMessage{
  266. msg: buf,
  267. id: nEvent.NetworkID,
  268. node: nEvent.NodeName,
  269. })
  270. }
  271. }
// handleBulkSync decodes a bulk sync message and applies its payload.
// If the sync is the response to one we solicited, it signals the waiting
// goroutine; if it is unsolicited, it replies with this node's state.
func (nDB *NetworkDB) handleBulkSync(buf []byte) {
	var bsm BulkSyncMessage
	if err := proto.Unmarshal(buf, &bsm); err != nil {
		logrus.Errorf("Error decoding bulk sync message: %v", err)
		return
	}

	if bsm.LTime > 0 {
		nDB.tableClock.Witness(bsm.LTime)
	}

	nDB.handleMessage(bsm.Payload, true)

	// A solicited bulk sync (Unsolicited == false) is the reply to a sync
	// this node initiated: close the ack channel to wake the waiter and do
	// not respond again.
	if !bsm.Unsolicited {
		nDB.Lock()
		ch, ok := nDB.bulkSyncAckTbl[bsm.NodeName]
		if ok {
			close(ch)
			delete(nDB.bulkSyncAckTbl, bsm.NodeName)
		}
		nDB.Unlock()

		return
	}

	// Resolve the sender's address (used only for logging below).
	var nodeAddr net.IP
	nDB.RLock()
	if node, ok := nDB.nodes[bsm.NodeName]; ok {
		nodeAddr = node.Addr
	}
	nDB.RUnlock()

	// Unsolicited sync: send our own state back for the same networks.
	if err := nDB.bulkSyncNode(bsm.Networks, bsm.NodeName, false); err != nil {
		logrus.Errorf("Error in responding to bulk sync from node %s: %v", nodeAddr, err)
	}
}
  303. func (nDB *NetworkDB) handleMessage(buf []byte, isBulkSync bool) {
  304. mType, data, err := decodeMessage(buf)
  305. if err != nil {
  306. logrus.Errorf("Error decoding gossip message to get message type: %v", err)
  307. return
  308. }
  309. switch mType {
  310. case MessageTypeNodeEvent:
  311. nDB.handleNodeMessage(data)
  312. case MessageTypeNetworkEvent:
  313. nDB.handleNetworkMessage(data)
  314. case MessageTypeTableEvent:
  315. nDB.handleTableMessage(data, isBulkSync)
  316. case MessageTypeBulkSync:
  317. nDB.handleBulkSync(data)
  318. case MessageTypeCompound:
  319. nDB.handleCompound(data, isBulkSync)
  320. default:
  321. logrus.Errorf("%v(%v): unknown message type %d", nDB.config.Hostname, nDB.config.NodeID, mType)
  322. }
  323. }
  324. func (d *delegate) NotifyMsg(buf []byte) {
  325. if len(buf) == 0 {
  326. return
  327. }
  328. d.nDB.handleMessage(buf, false)
  329. }
  330. func (d *delegate) GetBroadcasts(overhead, limit int) [][]byte {
  331. msgs := d.nDB.networkBroadcasts.GetBroadcasts(overhead, limit)
  332. msgs = append(msgs, d.nDB.nodeBroadcasts.GetBroadcasts(overhead, limit)...)
  333. return msgs
  334. }
  335. func (d *delegate) LocalState(join bool) []byte {
  336. if join {
  337. // Update all the local node/network state to a new time to
  338. // force update on the node we are trying to rejoin, just in
  339. // case that node has these in leaving state still. This is
  340. // facilitate fast convergence after recovering from a gossip
  341. // failure.
  342. d.nDB.updateLocalNetworkTime()
  343. }
  344. d.nDB.RLock()
  345. defer d.nDB.RUnlock()
  346. pp := NetworkPushPull{
  347. LTime: d.nDB.networkClock.Time(),
  348. NodeName: d.nDB.config.NodeID,
  349. }
  350. for name, nn := range d.nDB.networks {
  351. for _, n := range nn {
  352. pp.Networks = append(pp.Networks, &NetworkEntry{
  353. LTime: n.ltime,
  354. NetworkID: n.id,
  355. NodeName: name,
  356. Leaving: n.leaving,
  357. })
  358. }
  359. }
  360. buf, err := encodeMessage(MessageTypePushPull, &pp)
  361. if err != nil {
  362. logrus.Errorf("Failed to encode local network state: %v", err)
  363. return nil
  364. }
  365. return buf
  366. }
  367. func (d *delegate) MergeRemoteState(buf []byte, isJoin bool) {
  368. if len(buf) == 0 {
  369. logrus.Error("zero byte remote network state received")
  370. return
  371. }
  372. var gMsg GossipMessage
  373. err := proto.Unmarshal(buf, &gMsg)
  374. if err != nil {
  375. logrus.Errorf("Error unmarshalling push pull message: %v", err)
  376. return
  377. }
  378. if gMsg.Type != MessageTypePushPull {
  379. logrus.Errorf("Invalid message type %v received from remote", buf[0])
  380. }
  381. pp := NetworkPushPull{}
  382. if err := proto.Unmarshal(gMsg.Data, &pp); err != nil {
  383. logrus.Errorf("Failed to decode remote network state: %v", err)
  384. return
  385. }
  386. nodeEvent := &NodeEvent{
  387. LTime: pp.LTime,
  388. NodeName: pp.NodeName,
  389. Type: NodeEventTypeJoin,
  390. }
  391. d.nDB.handleNodeEvent(nodeEvent)
  392. for _, n := range pp.Networks {
  393. nEvent := &NetworkEvent{
  394. LTime: n.LTime,
  395. NodeName: n.NodeName,
  396. NetworkID: n.NetworkID,
  397. Type: NetworkEventTypeJoin,
  398. }
  399. if n.Leaving {
  400. nEvent.Type = NetworkEventTypeLeave
  401. }
  402. d.nDB.handleNetworkEvent(nEvent)
  403. }
  404. }