Merge pull request #1486 from sanimej/reap
Reap failed nodes after 24 hours
This commit is contained in:
commit
18c1727d1e
3 changed files with 23 additions and 3 deletions
|
@ -16,9 +16,11 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
reapInterval = 60 * time.Second
|
reapInterval = 60 * time.Second
|
||||||
reapPeriod = 5 * time.Second
|
reapPeriod = 5 * time.Second
|
||||||
retryInterval = 1 * time.Second
|
retryInterval = 1 * time.Second
|
||||||
|
nodeReapInterval = 24 * time.Hour
|
||||||
|
nodeReapPeriod = 2 * time.Hour
|
||||||
)
|
)
|
||||||
|
|
||||||
type logWriter struct{}
|
type logWriter struct{}
|
||||||
|
@ -147,6 +149,7 @@ func (nDB *NetworkDB) clusterInit() error {
|
||||||
{config.GossipInterval, nDB.gossip},
|
{config.GossipInterval, nDB.gossip},
|
||||||
{config.PushPullInterval, nDB.bulkSyncTables},
|
{config.PushPullInterval, nDB.bulkSyncTables},
|
||||||
{retryInterval, nDB.reconnectNode},
|
{retryInterval, nDB.reconnectNode},
|
||||||
|
{nodeReapPeriod, nDB.reapDeadNode},
|
||||||
} {
|
} {
|
||||||
t := time.NewTicker(trigger.interval)
|
t := time.NewTicker(trigger.interval)
|
||||||
go nDB.triggerFunc(trigger.interval, t.C, nDB.stopCh, trigger.fn)
|
go nDB.triggerFunc(trigger.interval, t.C, nDB.stopCh, trigger.fn)
|
||||||
|
@ -234,6 +237,19 @@ func (nDB *NetworkDB) triggerFunc(stagger time.Duration, C <-chan time.Time, sto
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (nDB *NetworkDB) reapDeadNode() {
|
||||||
|
nDB.Lock()
|
||||||
|
defer nDB.Unlock()
|
||||||
|
for id, n := range nDB.failedNodes {
|
||||||
|
if n.reapTime > 0 {
|
||||||
|
n.reapTime -= reapPeriod
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
logrus.Debugf("Removing failed node %v from gossip cluster", n.Name)
|
||||||
|
delete(nDB.failedNodes, id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (nDB *NetworkDB) reconnectNode() {
|
func (nDB *NetworkDB) reconnectNode() {
|
||||||
nDB.RLock()
|
nDB.RLock()
|
||||||
if len(nDB.failedNodes) == 0 {
|
if len(nDB.failedNodes) == 0 {
|
||||||
|
|
|
@ -29,6 +29,8 @@ func (e *eventDelegate) NotifyLeave(mn *memberlist.Node) {
|
||||||
e.nDB.Lock()
|
e.nDB.Lock()
|
||||||
if n, ok := e.nDB.nodes[mn.Name]; ok {
|
if n, ok := e.nDB.nodes[mn.Name]; ok {
|
||||||
delete(e.nDB.nodes, mn.Name)
|
delete(e.nDB.nodes, mn.Name)
|
||||||
|
|
||||||
|
n.reapTime = reapInterval
|
||||||
e.nDB.failedNodes[mn.Name] = n
|
e.nDB.failedNodes[mn.Name] = n
|
||||||
}
|
}
|
||||||
e.nDB.Unlock()
|
e.nDB.Unlock()
|
||||||
|
|
|
@ -94,6 +94,8 @@ type NetworkDB struct {
|
||||||
type node struct {
|
type node struct {
|
||||||
memberlist.Node
|
memberlist.Node
|
||||||
ltime serf.LamportTime
|
ltime serf.LamportTime
|
||||||
|
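// Countdown (a time.Duration, not an hour count) until the reaper removes the node; reapDeadNode decrements it on each pass and deletes the node once it is no longer positive.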
|
||||||
|
reapTime time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
// network describes the node/network attachment.
|
// network describes the node/network attachment.
|
||||||
|
|
Loading…
Add table
Reference in a new issue