|
@@ -98,10 +98,14 @@ func (nDB *NetworkDB) RemoveKey(key []byte) {
|
|
|
}
|
|
|
|
|
|
func (nDB *NetworkDB) clusterInit() error {
|
|
|
+ nDB.lastStatsTimestamp = time.Now()
|
|
|
+ nDB.lastHealthTimestamp = nDB.lastStatsTimestamp
|
|
|
+
|
|
|
config := memberlist.DefaultLANConfig()
|
|
|
config.Name = nDB.config.NodeName
|
|
|
config.BindAddr = nDB.config.BindAddr
|
|
|
config.AdvertiseAddr = nDB.config.AdvertiseAddr
|
|
|
+ config.UDPBufferSize = nDB.config.PacketBufferSize
|
|
|
|
|
|
if nDB.config.BindPort != 0 {
|
|
|
config.BindPort = nDB.config.BindPort
|
|
@@ -199,9 +203,8 @@ func (nDB *NetworkDB) clusterJoin(members []string) error {
|
|
|
mlist := nDB.memberlist
|
|
|
|
|
|
if _, err := mlist.Join(members); err != nil {
|
|
|
- // Incase of failure, keep retrying join until it succeeds or the cluster is shutdown.
|
|
|
+ // In case of failure, keep retrying join until it succeeds or the cluster is shut down.
|
|
|
go nDB.retryJoin(members, nDB.stopCh)
|
|
|
-
|
|
|
return fmt.Errorf("could not join node to memberlist: %v", err)
|
|
|
}
|
|
|
|
|
@@ -372,11 +375,21 @@ func (nDB *NetworkDB) gossip() {
|
|
|
networkNodes[nid] = nDB.networkNodes[nid]
|
|
|
|
|
|
}
|
|
|
+ printStats := time.Since(nDB.lastStatsTimestamp) >= nDB.config.StatsPrintPeriod
|
|
|
+ printHealth := time.Since(nDB.lastHealthTimestamp) >= nDB.config.HealthPrintPeriod
|
|
|
nDB.RUnlock()
|
|
|
|
|
|
+ if printHealth {
|
|
|
+ healthScore := nDB.memberlist.GetHealthScore()
|
|
|
+ if healthScore != 0 {
|
|
|
+ logrus.Warnf("NetworkDB stats - healthscore:%d (connectivity issues)", healthScore)
|
|
|
+ }
|
|
|
+ nDB.lastHealthTimestamp = time.Now()
|
|
|
+ }
|
|
|
+
|
|
|
for nid, nodes := range networkNodes {
|
|
|
mNodes := nDB.mRandomNodes(3, nodes)
|
|
|
- bytesAvail := udpSendBuf - compoundHeaderOverhead
|
|
|
+ bytesAvail := nDB.config.PacketBufferSize - compoundHeaderOverhead
|
|
|
|
|
|
nDB.RLock()
|
|
|
network, ok := thisNodeNetworks[nid]
|
|
@@ -397,6 +410,14 @@ func (nDB *NetworkDB) gossip() {
|
|
|
}
|
|
|
|
|
|
msgs := broadcastQ.GetBroadcasts(compoundOverhead, bytesAvail)
|
|
|
+ // Collect stats and print the queue info. This code sits before the empty-msgs check so that stats are also reported for empty queues.
|
|
|
+ network.qMessagesSent += len(msgs)
|
|
|
+ if printStats {
|
|
|
+ logrus.Infof("NetworkDB stats - Queue net:%s qLen:%d netPeers:%d netMsg/s:%d",
|
|
|
+ nid, broadcastQ.NumQueued(), broadcastQ.NumNodes(), network.qMessagesSent/int((nDB.config.StatsPrintPeriod/time.Second)))
|
|
|
+ network.qMessagesSent = 0
|
|
|
+ }
|
|
|
+
|
|
|
if len(msgs) == 0 {
|
|
|
continue
|
|
|
}
|
|
@@ -414,11 +435,15 @@ func (nDB *NetworkDB) gossip() {
|
|
|
}
|
|
|
|
|
|
// Send the compound message
|
|
|
- if err := nDB.memberlist.SendToUDP(&mnode.Node, compound); err != nil {
|
|
|
+ if err := nDB.memberlist.SendBestEffort(&mnode.Node, compound); err != nil {
|
|
|
logrus.Errorf("Failed to send gossip to %s: %s", mnode.Addr, err)
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+ // Reset the stats
|
|
|
+ if printStats {
|
|
|
+ nDB.lastStatsTimestamp = time.Now()
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
func (nDB *NetworkDB) bulkSyncTables() {
|
|
@@ -589,7 +614,7 @@ func (nDB *NetworkDB) bulkSyncNode(networks []string, node string, unsolicited b
|
|
|
nDB.bulkSyncAckTbl[node] = ch
|
|
|
nDB.Unlock()
|
|
|
|
|
|
- err = nDB.memberlist.SendToTCP(&mnode.Node, buf)
|
|
|
+ err = nDB.memberlist.SendReliable(&mnode.Node, buf)
|
|
|
if err != nil {
|
|
|
nDB.Lock()
|
|
|
delete(nDB.bulkSyncAckTbl, node)
|
|
@@ -606,7 +631,7 @@ func (nDB *NetworkDB) bulkSyncNode(networks []string, node string, unsolicited b
|
|
|
case <-t.C:
|
|
|
logrus.Errorf("Bulk sync to node %s timed out", node)
|
|
|
case <-ch:
|
|
|
- logrus.Debugf("%s: Bulk sync to node %s took %s", nDB.config.NodeName, node, time.Now().Sub(startTime))
|
|
|
+ logrus.Debugf("%s: Bulk sync to node %s took %s", nDB.config.NodeName, node, time.Since(startTime))
|
|
|
}
|
|
|
t.Stop()
|
|
|
}
|