-
Notifications
You must be signed in to change notification settings - Fork 470
fix: re-gossip dead/suspect accusations on stale alive and unknown node #345
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1074,6 +1074,25 @@ func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) { | |
| // Bail if the incarnation number is older, and this is not about us | ||
| isLocalNode := state.Name == m.config.Name | ||
| if a.Incarnation <= state.Incarnation && !isLocalNode && !updatesNode { | ||
| // If we believe this node is dead or suspect, re-gossip that state so | ||
| // the node — which may have restarted with incarnation=1 — can receive | ||
| // the accusation and refute it. Without this, a stale dead/suspect | ||
| // message with a high incarnation number can permanently "orphan" in | ||
| // the cluster: the restarted node broadcasts alive(inc=1), but no node | ||
| // that receives it forwards the dead(inc=100) back to it, so the node | ||
| // never gets the chance to call refute(). | ||
| switch state.State { | ||
| case StateDead: | ||
| m.logger.Printf("[WARN] memberlist: Received stale alive for dead node %s (alive-inc: %d <= dead-inc: %d); re-gossiping dead msg to help node refute", | ||
| a.Node, a.Incarnation, state.Incarnation) | ||
| d := dead{Incarnation: state.Incarnation, Node: state.Name, From: m.config.Name} | ||
| m.encodeAndBroadcast(state.Name, deadMsg, d) | ||
| case StateSuspect: | ||
| m.logger.Printf("[WARN] memberlist: Received stale alive for suspect node %s (alive-inc: %d <= suspect-inc: %d); re-gossiping suspect msg to help node refute", | ||
| a.Node, a.Incarnation, state.Incarnation) | ||
| s := suspect{Incarnation: state.Incarnation, Node: state.Name, From: m.config.Name} | ||
| m.encodeAndBroadcast(state.Name, suspectMsg, s) | ||
| } | ||
| return | ||
| } | ||
|
|
||
|
|
@@ -1162,8 +1181,14 @@ func (m *Memberlist) suspectNode(s *suspect) { | |
| defer m.nodeLock.Unlock() | ||
| state, ok := m.nodeMap[s.Node] | ||
|
|
||
| // If we've never heard about this node before, ignore it | ||
| // If we've never heard about this node before, forward the suspect message | ||
| // anyway so it can propagate through nodes that may know the target. During | ||
| // mass restarts, freshly joined nodes have incomplete cluster views and | ||
| // silently dropping the message here creates a gossip black hole. | ||
| if !ok { | ||
| m.logger.Printf("[WARN] memberlist: Forwarding suspect msg for unknown node %s (inc: %d, from: %s)", | ||
| s.Node, s.Incarnation, s.From) | ||
| m.encodeAndBroadcast(s.Node, suspectMsg, s) | ||
| return | ||
| } | ||
|
|
||
|
|
@@ -1203,6 +1228,7 @@ func (m *Memberlist) suspectNode(s *suspect) { | |
| // Update the state | ||
| state.Incarnation = s.Incarnation | ||
| state.State = StateSuspect | ||
| m.logger.Printf("[INFO] memberlist: Marking %s as suspect (incarnation: %d, from: %s)", s.Node, s.Incarnation, s.From) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These logs are going to be extremely noisy in real-world large clusters (ex 1000s of Consul nodes) and balloon operator costs. Let's remove all these new logs. |
||
| changeTime := time.Now() | ||
| state.StateChange = changeTime | ||
|
|
||
|
|
@@ -1255,8 +1281,15 @@ func (m *Memberlist) deadNode(d *dead) { | |
| defer m.nodeLock.Unlock() | ||
| state, ok := m.nodeMap[d.Node] | ||
|
|
||
| // If we've never heard about this node before, ignore it | ||
| // If we've never heard about this node before, forward the dead message | ||
| // anyway so it can propagate through nodes that do know the target. During | ||
| // mass restarts, freshly joined nodes have incomplete cluster views and | ||
| // silently dropping the message here creates a gossip black hole that | ||
| // prevents the accused node from ever receiving the accusation and refuting. | ||
| if !ok { | ||
| m.logger.Printf("[WARN] memberlist: Forwarding dead msg for unknown node %s (inc: %d, from: %s)", | ||
| d.Node, d.Incarnation, d.From) | ||
| m.encodeAndBroadcast(d.Node, deadMsg, d) | ||
| return | ||
| } | ||
|
|
||
|
|
@@ -1298,8 +1331,10 @@ func (m *Memberlist) deadNode(d *dead) { | |
| // instead of dead. | ||
| if d.Node == d.From { | ||
| state.State = StateLeft | ||
| m.logger.Printf("[INFO] memberlist: %s has left the cluster (incarnation: %d)", d.Node, d.Incarnation) | ||
| } else { | ||
| state.State = StateDead | ||
| m.logger.Printf("[INFO] memberlist: Marking %s as dead (incarnation: %d, from: %s)", d.Node, d.Incarnation, d.From) | ||
| } | ||
| state.StateChange = time.Now() | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2664,3 +2664,190 @@ func verifySampleExists(t *testing.T, name string, sink *metrics.InmemSink) { | |
| t.Fatalf("%s sample not emmited", name) | ||
| } | ||
| } | ||
|
|
||
| // --------------------------------------------------------------------------- | ||
| // Tests for the mass-restart incarnation race condition fix | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "race condition"? |
||
| // | ||
| // Scenario: A node is declared dead at a high incarnation (e.g. 100). The | ||
| // node then restarts and re-announces itself at incarnation 1. Two separate | ||
| // bugs prevented the dead accusation from ever reaching the restarted node: | ||
| // | ||
| // Bug 1 (aliveNode): a stale alive(inc=1) arriving at a node that already | ||
| // holds dead(inc=100) was silently dropped — the dead message was never | ||
| // re-gossiped, so the restarted node never learned it needed to refute. | ||
| // | ||
| // Bug 2 (deadNode / suspectNode): a dead/suspect message arriving at a node | ||
| // that has never heard of the target was silently dropped instead of being | ||
| // forwarded, creating a gossip black hole in freshly-joined nodes. | ||
| // --------------------------------------------------------------------------- | ||
|
Comment on lines
+2671
to
+2682
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Having entirely separate tests as you've done here undermines the reasoning for why you need both fixes. The tests exercise the low-level behavior you've explained in the PR description but not proven that you've fixed the actual user-facing beahvior we care about. |
||
|
|
||
| // TestMemberList_AliveNode_ReGossipsDeadAccusation verifies Bug 1 fix for the | ||
| // dead state: receiving a stale alive message must re-queue the dead accusation | ||
| // so the restarted node can learn about it and call refute(). | ||
| func TestMemberList_AliveNode_ReGossipsDeadAccusation(t *testing.T) { | ||
| m := GetMemberlist(t, nil) | ||
| defer func() { | ||
| if err := m.Shutdown(); err != nil { | ||
| t.Fatal(err) | ||
| } | ||
| }() | ||
|
|
||
| // Establish the node at the high pre-restart incarnation (100). | ||
| a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 100, Vsn: m.config.BuildVsnArray()} | ||
| m.aliveNode(&a, nil, false) | ||
|
|
||
| // Another node marks it dead at incarnation 100 — this is the stale | ||
| // accusation that will keep floating around after the restart. | ||
| d := dead{Node: "test", From: "accuser", Incarnation: 100} | ||
| m.deadNode(&d) | ||
|
|
||
| if m.getNodeState("test") != StateDead { | ||
| t.Fatalf("expected node to be dead after setup") | ||
| } | ||
|
|
||
| // Drain setup broadcasts so assertions below are clean. | ||
| m.broadcasts.Reset() | ||
|
|
||
| // The restarted node announces alive(inc=1) — incarnation reset to 1 | ||
| // because the in-memory counter was wiped on restart. | ||
| staleAlive := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1, Vsn: m.config.BuildVsnArray()} | ||
| m.aliveNode(&staleAlive, nil, false) | ||
|
|
||
| // The node must remain dead — a stale alive(inc=1) must not revive a | ||
| // node that is dead at inc=100. | ||
| if m.getNodeState("test") != StateDead { | ||
| t.Fatalf("stale alive(inc=1) must not revive a dead(inc=100) node") | ||
| } | ||
|
|
||
| // The dead accusation must have been re-queued for gossip so that the | ||
| // restarted node can eventually receive it and call refute(). | ||
| if m.broadcasts.NumQueued() != 1 { | ||
| t.Fatalf("expected dead accusation to be re-gossiped: got %d queued broadcasts", m.broadcasts.NumQueued()) | ||
| } | ||
|
|
||
| // Confirm the re-queued broadcast is a dead message, not alive or suspect. | ||
| if messageType(m.broadcasts.orderedView(true)[0].b.Message()[0]) != deadMsg { | ||
| t.Fatalf("re-gossiped broadcast must be a dead message") | ||
| } | ||
| } | ||
|
|
||
| // TestMemberList_AliveNode_ReGossipsSuspectAccusation verifies Bug 1 fix for | ||
| // the suspect state: receiving a stale alive message must re-queue the suspect | ||
| // accusation so the restarted node can refute before the suspicion timer fires. | ||
| func TestMemberList_AliveNode_ReGossipsSuspectAccusation(t *testing.T) { | ||
| m := GetMemberlist(t, func(c *Config) { | ||
| // Use a very long probe interval so the suspect-to-dead timer does | ||
| // not fire and add a second broadcast during the assertion window. | ||
| c.ProbeInterval = time.Hour | ||
| c.SuspicionMult = 5 | ||
| }) | ||
| defer func() { | ||
| if err := m.Shutdown(); err != nil { | ||
| t.Fatal(err) | ||
| } | ||
| }() | ||
|
|
||
| // Establish the node at the high pre-restart incarnation (100). | ||
| a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 100, Vsn: m.config.BuildVsnArray()} | ||
| m.aliveNode(&a, nil, false) | ||
|
|
||
| // Mark it suspect at incarnation 100. | ||
| s := suspect{Node: "test", From: "accuser", Incarnation: 100} | ||
| m.suspectNode(&s) | ||
|
|
||
| if m.getNodeState("test") != StateSuspect { | ||
| t.Fatalf("expected node to be suspect after setup") | ||
| } | ||
|
|
||
| // Drain setup broadcasts. | ||
| m.broadcasts.Reset() | ||
|
|
||
| // The restarted node announces alive(inc=1). | ||
| staleAlive := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1, Vsn: m.config.BuildVsnArray()} | ||
| m.aliveNode(&staleAlive, nil, false) | ||
|
|
||
| // Suspect state must be preserved — stale alive(inc=1) must not clear it. | ||
| if m.getNodeState("test") != StateSuspect { | ||
| t.Fatalf("stale alive(inc=1) must not clear suspect(inc=100) state") | ||
| } | ||
|
|
||
| // The suspect accusation must be re-queued so the restarted node can | ||
| // receive it and refute before the suspicion timer declares it dead. | ||
| if m.broadcasts.NumQueued() != 1 { | ||
| t.Fatalf("expected suspect accusation to be re-gossiped: got %d queued broadcasts", m.broadcasts.NumQueued()) | ||
| } | ||
|
|
||
| if messageType(m.broadcasts.orderedView(true)[0].b.Message()[0]) != suspectMsg { | ||
| t.Fatalf("re-gossiped broadcast must be a suspect message") | ||
| } | ||
| } | ||
|
|
||
| // TestMemberList_DeadNode_UnknownNode_ForwardsMessage verifies Bug 2 fix for | ||
| // dead messages: a dead message about a node we have never heard of must be | ||
| // forwarded via gossip rather than silently dropped, so the message can | ||
| // propagate through nodes with incomplete cluster views during mass restarts. | ||
| func TestMemberList_DeadNode_UnknownNode_ForwardsMessage(t *testing.T) { | ||
| m := GetMemberlist(t, nil) | ||
| defer func() { | ||
| if err := m.Shutdown(); err != nil { | ||
| t.Fatal(err) | ||
| } | ||
| }() | ||
|
|
||
| // Sanity: no nodes registered before the call. | ||
| if len(m.nodes) != 0 { | ||
| t.Fatalf("expected empty node list at start, got %d", len(m.nodes)) | ||
| } | ||
|
|
||
| d := dead{Node: "unknown", From: "accuser", Incarnation: 100} | ||
| m.deadNode(&d) | ||
|
|
||
| // The unknown node must NOT be added to the node map — we are only | ||
| // forwarding the message, not adopting membership state for it. | ||
| if len(m.nodes) != 0 { | ||
| t.Fatalf("dead msg for unknown node must not create a node entry") | ||
| } | ||
|
|
||
| // The dead message must be queued for re-broadcast so it can reach nodes | ||
| // that do know the target (including the target itself if it is alive). | ||
| if m.broadcasts.NumQueued() != 1 { | ||
| t.Fatalf("expected dead message to be forwarded: got %d queued broadcasts", m.broadcasts.NumQueued()) | ||
| } | ||
|
|
||
| if messageType(m.broadcasts.orderedView(true)[0].b.Message()[0]) != deadMsg { | ||
| t.Fatalf("forwarded broadcast must be a dead message") | ||
| } | ||
| } | ||
|
|
||
| // TestMemberList_SuspectNode_UnknownNode_ForwardsMessage verifies Bug 2 fix | ||
| // for suspect messages: a suspect message about an unknown node must be | ||
| // forwarded rather than dropped, for the same reason as the dead-message case. | ||
| func TestMemberList_SuspectNode_UnknownNode_ForwardsMessage(t *testing.T) { | ||
| m := GetMemberlist(t, nil) | ||
| defer func() { | ||
| if err := m.Shutdown(); err != nil { | ||
| t.Fatal(err) | ||
| } | ||
| }() | ||
|
|
||
| if len(m.nodes) != 0 { | ||
| t.Fatalf("expected empty node list at start, got %d", len(m.nodes)) | ||
| } | ||
|
|
||
| s := suspect{Node: "unknown", From: "accuser", Incarnation: 100} | ||
| m.suspectNode(&s) | ||
|
|
||
| // No node entry must be created for the unknown node. | ||
| if len(m.nodes) != 0 { | ||
| t.Fatalf("suspect msg for unknown node must not create a node entry") | ||
| } | ||
|
|
||
| // Suspect message must be forwarded via gossip. | ||
| if m.broadcasts.NumQueued() != 1 { | ||
| t.Fatalf("expected suspect message to be forwarded: got %d queued broadcasts", m.broadcasts.NumQueued()) | ||
| } | ||
|
|
||
| if messageType(m.broadcasts.orderedView(true)[0].b.Message()[0]) != suspectMsg { | ||
| t.Fatalf("forwarded broadcast must be a suspect message") | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's not clear to me why this part is needed, and despite the retransmit limit this seems like it's going to increase cluster traffic unnecessarily when a node has actually left permanently.
In the SWIM protocol the Confirm message ("Refute" in this library) overrides all Alive or Suspect messages regardless of incarnation. That's why you made the fix in
aliveNode(): the restarted node is going to broadcast its initial alive message with incarnation=1, get told its incarnation is stale, and then increment is incarnation appropriately for the next message. So the nodes in the "black hole" will converge on getting an alive message for that node, with incarnation higher than any dead/suspect message they've dropped anyways.