diff --git a/state.go b/state.go index 81c3bceca..5a8623d3e 100644 --- a/state.go +++ b/state.go @@ -1074,6 +1074,25 @@ func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) { // Bail if the incarnation number is older, and this is not about us isLocalNode := state.Name == m.config.Name if a.Incarnation <= state.Incarnation && !isLocalNode && !updatesNode { + // If we believe this node is dead or suspect, re-gossip that state so + // the node — which may have restarted with incarnation=1 — can receive + // the accusation and refute it. Without this, a stale dead/suspect + // message with a high incarnation number can permanently "orphan" in + // the cluster: the restarted node broadcasts alive(inc=1), but no node + // that receives it forwards the dead(inc=100) back to it, so the node + // never gets the chance to call refute(). + switch state.State { + case StateDead: + m.logger.Printf("[WARN] memberlist: Received stale alive for dead node %s (alive-inc: %d <= dead-inc: %d); re-gossiping dead msg to help node refute", + a.Node, a.Incarnation, state.Incarnation) + d := dead{Incarnation: state.Incarnation, Node: state.Name, From: m.config.Name} + m.encodeAndBroadcast(state.Name, deadMsg, d) + case StateSuspect: + m.logger.Printf("[WARN] memberlist: Received stale alive for suspect node %s (alive-inc: %d <= suspect-inc: %d); re-gossiping suspect msg to help node refute", + a.Node, a.Incarnation, state.Incarnation) + s := suspect{Incarnation: state.Incarnation, Node: state.Name, From: m.config.Name} + m.encodeAndBroadcast(state.Name, suspectMsg, s) + } return } @@ -1162,8 +1181,14 @@ func (m *Memberlist) suspectNode(s *suspect) { defer m.nodeLock.Unlock() state, ok := m.nodeMap[s.Node] - // If we've never heard about this node before, ignore it + // If we've never heard about this node before, forward the suspect message + // anyway so it can propagate through nodes that may know the target. During + // mass restarts, freshly joined nodes have incomplete cluster views and + // silently dropping the message here creates a gossip black hole. if !ok { + m.logger.Printf("[WARN] memberlist: Forwarding suspect msg for unknown node %s (inc: %d, from: %s)", + s.Node, s.Incarnation, s.From) + m.encodeAndBroadcast(s.Node, suspectMsg, s) return } @@ -1203,6 +1228,7 @@ func (m *Memberlist) suspectNode(s *suspect) { // Update the state state.Incarnation = s.Incarnation state.State = StateSuspect + m.logger.Printf("[INFO] memberlist: Marking %s as suspect (incarnation: %d, from: %s)", s.Node, s.Incarnation, s.From) changeTime := time.Now() state.StateChange = changeTime @@ -1255,8 +1281,15 @@ func (m *Memberlist) deadNode(d *dead) { defer m.nodeLock.Unlock() state, ok := m.nodeMap[d.Node] - // If we've never heard about this node before, ignore it + // If we've never heard about this node before, forward the dead message + // anyway so it can propagate through nodes that do know the target. During + // mass restarts, freshly joined nodes have incomplete cluster views and + // silently dropping the message here creates a gossip black hole that + // prevents the accused node from ever receiving the accusation and refuting. if !ok { + m.logger.Printf("[WARN] memberlist: Forwarding dead msg for unknown node %s (inc: %d, from: %s)", + d.Node, d.Incarnation, d.From) + m.encodeAndBroadcast(d.Node, deadMsg, d) return } @@ -1298,8 +1331,10 @@ func (m *Memberlist) deadNode(d *dead) { // instead of dead. if d.Node == d.From { state.State = StateLeft + m.logger.Printf("[INFO] memberlist: %s has left the cluster (incarnation: %d)", d.Node, d.Incarnation) } else { state.State = StateDead + m.logger.Printf("[INFO] memberlist: Marking %s as dead (incarnation: %d, from: %s)", d.Node, d.Incarnation, d.From) } state.StateChange = time.Now() diff --git a/state_test.go b/state_test.go index 0b177fc54..7b27b6781 100644 --- a/state_test.go +++ b/state_test.go @@ -2664,3 +2664,190 @@ func verifySampleExists(t *testing.T, name string, sink *metrics.InmemSink) { t.Fatalf("%s sample not emmited", name) } } + +// --------------------------------------------------------------------------- +// Tests for the mass-restart incarnation race condition fix +// +// Scenario: A node is declared dead at a high incarnation (e.g. 100). The +// node then restarts and re-announces itself at incarnation 1. Two separate +// bugs prevented the dead accusation from ever reaching the restarted node: +// +// Bug 1 (aliveNode): a stale alive(inc=1) arriving at a node that already +// holds dead(inc=100) was silently dropped — the dead message was never +// re-gossiped, so the restarted node never learned it needed to refute. +// +// Bug 2 (deadNode / suspectNode): a dead/suspect message arriving at a node +// that has never heard of the target was silently dropped instead of being +// forwarded, creating a gossip black hole in freshly-joined nodes. +// --------------------------------------------------------------------------- + +// TestMemberList_AliveNode_ReGossipsDeadAccusation verifies Bug 1 fix for the +// dead state: receiving a stale alive message must re-queue the dead accusation +// so the restarted node can learn about it and call refute(). +func TestMemberList_AliveNode_ReGossipsDeadAccusation(t *testing.T) { + m := GetMemberlist(t, nil) + defer func() { + if err := m.Shutdown(); err != nil { + t.Fatal(err) + } + }() + + // Establish the node at the high pre-restart incarnation (100). + a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 100, Vsn: m.config.BuildVsnArray()} + m.aliveNode(&a, nil, false) + + // Another node marks it dead at incarnation 100 — this is the stale + // accusation that will keep floating around after the restart. + d := dead{Node: "test", From: "accuser", Incarnation: 100} + m.deadNode(&d) + + if m.getNodeState("test") != StateDead { + t.Fatalf("expected node to be dead after setup") + } + + // Drain setup broadcasts so assertions below are clean. + m.broadcasts.Reset() + + // The restarted node announces alive(inc=1) — incarnation reset to 1 + // because the in-memory counter was wiped on restart. + staleAlive := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1, Vsn: m.config.BuildVsnArray()} + m.aliveNode(&staleAlive, nil, false) + + // The node must remain dead — a stale alive(inc=1) must not revive a + // node that is dead at inc=100. + if m.getNodeState("test") != StateDead { + t.Fatalf("stale alive(inc=1) must not revive a dead(inc=100) node") + } + + // The dead accusation must have been re-queued for gossip so that the + // restarted node can eventually receive it and call refute(). + if m.broadcasts.NumQueued() != 1 { + t.Fatalf("expected dead accusation to be re-gossiped: got %d queued broadcasts", m.broadcasts.NumQueued()) + } + + // Confirm the re-queued broadcast is a dead message, not alive or suspect. + if messageType(m.broadcasts.orderedView(true)[0].b.Message()[0]) != deadMsg { + t.Fatalf("re-gossiped broadcast must be a dead message") + } +} + +// TestMemberList_AliveNode_ReGossipsSuspectAccusation verifies Bug 1 fix for +// the suspect state: receiving a stale alive message must re-queue the suspect +// accusation so the restarted node can refute before the suspicion timer fires. +func TestMemberList_AliveNode_ReGossipsSuspectAccusation(t *testing.T) { + m := GetMemberlist(t, func(c *Config) { + // Use a very long probe interval so the suspect-to-dead timer does + // not fire and add a second broadcast during the assertion window. + c.ProbeInterval = time.Hour + c.SuspicionMult = 5 + }) + defer func() { + if err := m.Shutdown(); err != nil { + t.Fatal(err) + } + }() + + // Establish the node at the high pre-restart incarnation (100). + a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 100, Vsn: m.config.BuildVsnArray()} + m.aliveNode(&a, nil, false) + + // Mark it suspect at incarnation 100. + s := suspect{Node: "test", From: "accuser", Incarnation: 100} + m.suspectNode(&s) + + if m.getNodeState("test") != StateSuspect { + t.Fatalf("expected node to be suspect after setup") + } + + // Drain setup broadcasts. + m.broadcasts.Reset() + + // The restarted node announces alive(inc=1). + staleAlive := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1, Vsn: m.config.BuildVsnArray()} + m.aliveNode(&staleAlive, nil, false) + + // Suspect state must be preserved — stale alive(inc=1) must not clear it. + if m.getNodeState("test") != StateSuspect { + t.Fatalf("stale alive(inc=1) must not clear suspect(inc=100) state") + } + + // The suspect accusation must be re-queued so the restarted node can + // receive it and refute before the suspicion timer declares it dead. + if m.broadcasts.NumQueued() != 1 { + t.Fatalf("expected suspect accusation to be re-gossiped: got %d queued broadcasts", m.broadcasts.NumQueued()) + } + + if messageType(m.broadcasts.orderedView(true)[0].b.Message()[0]) != suspectMsg { + t.Fatalf("re-gossiped broadcast must be a suspect message") + } +} + +// TestMemberList_DeadNode_UnknownNode_ForwardsMessage verifies Bug 2 fix for +// dead messages: a dead message about a node we have never heard of must be +// forwarded via gossip rather than silently dropped, so the message can +// propagate through nodes with incomplete cluster views during mass restarts. +func TestMemberList_DeadNode_UnknownNode_ForwardsMessage(t *testing.T) { + m := GetMemberlist(t, nil) + defer func() { + if err := m.Shutdown(); err != nil { + t.Fatal(err) + } + }() + + // Sanity: no nodes registered before the call. + if len(m.nodes) != 0 { + t.Fatalf("expected empty node list at start, got %d", len(m.nodes)) + } + + d := dead{Node: "unknown", From: "accuser", Incarnation: 100} + m.deadNode(&d) + + // The unknown node must NOT be added to the node map — we are only + // forwarding the message, not adopting membership state for it. + if len(m.nodes) != 0 { + t.Fatalf("dead msg for unknown node must not create a node entry") + } + + // The dead message must be queued for re-broadcast so it can reach nodes + // that do know the target (including the target itself if it is alive). + if m.broadcasts.NumQueued() != 1 { + t.Fatalf("expected dead message to be forwarded: got %d queued broadcasts", m.broadcasts.NumQueued()) + } + + if messageType(m.broadcasts.orderedView(true)[0].b.Message()[0]) != deadMsg { + t.Fatalf("forwarded broadcast must be a dead message") + } +} + +// TestMemberList_SuspectNode_UnknownNode_ForwardsMessage verifies Bug 2 fix +// for suspect messages: a suspect message about an unknown node must be +// forwarded rather than dropped, for the same reason as the dead-message case. +func TestMemberList_SuspectNode_UnknownNode_ForwardsMessage(t *testing.T) { + m := GetMemberlist(t, nil) + defer func() { + if err := m.Shutdown(); err != nil { + t.Fatal(err) + } + }() + + if len(m.nodes) != 0 { + t.Fatalf("expected empty node list at start, got %d", len(m.nodes)) + } + + s := suspect{Node: "unknown", From: "accuser", Incarnation: 100} + m.suspectNode(&s) + + // No node entry must be created for the unknown node. + if len(m.nodes) != 0 { + t.Fatalf("suspect msg for unknown node must not create a node entry") + } + + // Suspect message must be forwarded via gossip. + if m.broadcasts.NumQueued() != 1 { + t.Fatalf("expected suspect message to be forwarded: got %d queued broadcasts", m.broadcasts.NumQueued()) + } + + if messageType(m.broadcasts.orderedView(true)[0].b.Message()[0]) != suspectMsg { + t.Fatalf("forwarded broadcast must be a suspect message") + } +}