Reap failed nodes after 24 hours

Signed-off-by: Santhosh Manohar <santhosh@docker.com>
This commit is contained in:
Santhosh Manohar 2016-09-30 14:03:10 -07:00
parent 6b74a8d479
commit e98b152bac
3 changed files with 23 additions and 3 deletions

View file

@ -16,9 +16,11 @@ import (
)
// Timing constants for NetworkDB's periodic background tasks.
// NOTE(review): this excerpt is a rendered diff without +/- markers, so the
// first three entries appear to be the pre-change side of the hunk and are
// repeated by the next three — presumably the real file declares each name
// once; confirm against the repository.
const (
reapInterval = 60 * time.Second
reapPeriod = 5 * time.Second
retryInterval = 1 * time.Second
// reapInterval is the value NotifyLeave stores in a departing node's
// reapTime when it moves the node to failedNodes.
reapInterval = 60 * time.Second
// reapPeriod is the amount reapDeadNode subtracts from a failed node's
// reapTime on each run.
reapPeriod = 5 * time.Second
// retryInterval is the ticker interval registered for nDB.reconnectNode
// in clusterInit.
retryInterval = 1 * time.Second
// nodeReapInterval is not referenced anywhere in the visible code.
// NOTE(review): presumably the intended lifetime of a failed node (the
// commit title says 24 hours) — confirm whether reapTime should be seeded
// from it.
nodeReapInterval = 24 * time.Hour
// nodeReapPeriod is the ticker interval registered for nDB.reapDeadNode
// in clusterInit.
nodeReapPeriod = 2 * time.Hour
)
// logWriter is a field-less type; any methods attached to it are declared
// outside this excerpt.
type logWriter struct{}
@ -147,6 +149,7 @@ func (nDB *NetworkDB) clusterInit() error {
{config.GossipInterval, nDB.gossip},
{config.PushPullInterval, nDB.bulkSyncTables},
{retryInterval, nDB.reconnectNode},
{nodeReapPeriod, nDB.reapDeadNode},
} {
t := time.NewTicker(trigger.interval)
go nDB.triggerFunc(trigger.interval, t.C, nDB.stopCh, trigger.fn)
@ -234,6 +237,19 @@ func (nDB *NetworkDB) triggerFunc(stagger time.Duration, C <-chan time.Time, sto
}
}
// reapDeadNode ages every entry in nDB.failedNodes and drops the ones whose
// countdown has run out. clusterInit registers it on a ticker whose interval
// is nodeReapPeriod, and it holds the NetworkDB write lock for the whole pass.
//
// Each failed node carries a reapTime countdown, which NotifyLeave seeds with
// reapInterval. On every run a node with a positive countdown has reapPeriod
// subtracted from it and is kept; a node whose countdown is zero or negative
// is logged and deleted from failedNodes.
//
// NOTE(review): the countdown is expressed in reapInterval/reapPeriod units
// (60s / 5s = 12 decrements) while the ticker fires every nodeReapPeriod (2h),
// so a node survives roughly 12 runs (~24h) and is removed on the following
// run. Swapping in nodeReapPeriod here without also seeding reapTime from
// nodeReapInterval would shorten that lifetime drastically — change both
// together.
// NOTE(review): assumes failedNodes stores *node so the decrement persists
// across runs — confirm against the NetworkDB definition.
func (nDB *NetworkDB) reapDeadNode() {
	nDB.Lock()
	defer nDB.Unlock()

	for name, failed := range nDB.failedNodes {
		if failed.reapTime <= 0 {
			// Countdown exhausted: forget the node entirely.
			logrus.Debugf("Removing failed node %v from gossip cluster", failed.Name)
			delete(nDB.failedNodes, name)
			continue
		}
		// Still within its grace window; burn one period and keep it.
		failed.reapTime -= reapPeriod
	}
}
func (nDB *NetworkDB) reconnectNode() {
nDB.RLock()
if len(nDB.failedNodes) == 0 {

View file

@ -29,6 +29,8 @@ func (e *eventDelegate) NotifyLeave(mn *memberlist.Node) {
e.nDB.Lock()
if n, ok := e.nDB.nodes[mn.Name]; ok {
delete(e.nDB.nodes, mn.Name)
n.reapTime = reapInterval
e.nDB.failedNodes[mn.Name] = n
}
e.nDB.Unlock()

View file

@ -94,6 +94,8 @@ type NetworkDB struct {
// node is NetworkDB's record for a cluster member: the embedded
// memberlist.Node plus bookkeeping fields.
type node struct {
memberlist.Node
ltime serf.LamportTime
// Countdown, as a time.Duration (not a count of hours), that governs when
// a failed node is forgotten: NotifyLeave sets it to reapInterval when the
// node is moved to failedNodes, and reapDeadNode subtracts reapPeriod on
// each run, deleting the node once the value is no longer positive.
reapTime time.Duration
}
// network describes the node/network attachment.