From ffda1c38c4a0ffb357d29cf4b9337740e8a51516 Mon Sep 17 00:00:00 2001 From: Martin Raszyk Date: Fri, 8 May 2026 15:37:34 +0000 Subject: [PATCH 1/9] chore: prune streams to deleted subnets --- rs/messaging/src/state_machine.rs | 3 ++ rs/messaging/src/state_machine/tests.rs | 38 +++++++++++++++++++++ rs/replicated_state/src/replicated_state.rs | 19 +++++++++++ 3 files changed, 60 insertions(+) diff --git a/rs/messaging/src/state_machine.rs b/rs/messaging/src/state_machine.rs index 880acf043eab..2a59809eb098 100644 --- a/rs/messaging/src/state_machine.rs +++ b/rs/messaging/src/state_machine.rs @@ -234,6 +234,9 @@ impl StateMachine for StateMachineImpl { self.observe_phase_duration(PHASE_INDUCTION, &since); + // Discard streams to subnets no longer present in the network topology. + state_with_messages.discard_streams_for_deleted_subnets(); + let execution_round_type = if requires_full_state_hash { ExecutionRoundType::CheckpointRound } else { diff --git a/rs/messaging/src/state_machine/tests.rs b/rs/messaging/src/state_machine/tests.rs index 7738d7325c01..d96b0db65ba2 100644 --- a/rs/messaging/src/state_machine/tests.rs +++ b/rs/messaging/src/state_machine/tests.rs @@ -10,6 +10,7 @@ use ic_registry_subnet_features::SubnetFeatures; use ic_registry_subnet_type::SubnetType; use ic_replicated_state::{ ReplicatedState, SubnetTopology, metadata_state::testing::NetworkTopologyTesting, + testing::ReplicatedStateTesting, }; use ic_test_utilities_execution_environment::test_registry_settings; use ic_test_utilities_logger::with_test_replica_logger; @@ -249,6 +250,43 @@ fn test_delivered_batch_interface() { } } +#[test] +fn state_machine_discards_stream_for_deleted_subnet() { + let provided_batch = BatchBuilder::new().batch_number(Height::new(1)).build(); + let fixture = test_fixture(&provided_batch); + + // Add a stream to SUBNET_2, which is not present in the fixture's network topology. + let mut initial_state = fixture.initial_state; + initial_state.modify_streams(|streams| { + streams.insert(SUBNET_2, Default::default()); + }); + assert!(initial_state.get_stream(&SUBNET_2).is_some()); + + with_test_replica_logger(|log| { + let state_machine = Box::new(StateMachineImpl::new( + fixture.scheduler, + fixture.demux, + fixture.stream_builder, + Default::default(), + log, + fixture.metrics, + )); + + let state = state_machine.execute_round( + initial_state, + fixture.network_topology.clone(), + provided_batch, + Default::default(), + Default::default(), + &test_registry_settings(), + Default::default(), + Default::default(), + ); + + assert!(state.get_stream(&SUBNET_2).is_none()); + }); +} + const NNS_SUBNET_ID: SubnetId = SUBNET_0; const SUBNET_A: SubnetId = SUBNET_1; const SUBNET_B: SubnetId = SUBNET_2; diff --git a/rs/replicated_state/src/replicated_state.rs b/rs/replicated_state/src/replicated_state.rs index d94c2a04c1b9..56d29d324334 100644 --- a/rs/replicated_state/src/replicated_state.rs +++ b/rs/replicated_state/src/replicated_state.rs @@ -855,6 +855,25 @@ impl ReplicatedState { self.metadata.streams.get(destination_subnet_id) } + /// Discards streams to subnets no longer present in the network topology. + /// + /// Called after the induction phase of each round, once the new + /// `NetworkTopology` (reflecting any registry deletions) has been applied. + /// Safe to call because by the time the registry deletion takes effect, all + /// certified stream slices from the deleted subnet have already been inducted + /// in the same round, and no new certified slices can be produced once the + /// subnet's key is removed from the registry. + pub fn discard_streams_for_deleted_subnets(&mut self) { + let mut streams = self.take_streams(); + streams.retain(|subnet_id, _| { + self.metadata + .network_topology + .subnets() + .contains_key(subnet_id) + }); + self.put_streams(streams); + } + /// Returns the sum of reserved compute allocations of all currently /// available canisters. pub fn total_compute_allocation(&self) -> u64 { From b9234458acc446a7ea547dd04880c9144b242924 Mon Sep 17 00:00:00 2001 From: Martin Raszyk Date: Fri, 8 May 2026 15:50:21 +0000 Subject: [PATCH 2/9] do not raise critical error for discarded best-effort responses --- rs/messaging/src/routing/stream_builder.rs | 27 ++++++++++++++-------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/rs/messaging/src/routing/stream_builder.rs b/rs/messaging/src/routing/stream_builder.rs index 48c2dfa02d32..3d78b2822c0f 100644 --- a/rs/messaging/src/routing/stream_builder.rs +++ b/rs/messaging/src/routing/stream_builder.rs @@ -575,15 +575,24 @@ impl StreamBuilderImpl { } RequestOrResponse::Response(rep) => { // A Response: discard it. - error!( - self.log, - "{}: Discarding response, destination not found: {:?}", - CRITICAL_ERROR_RESPONSE_DESTINATION_NOT_FOUND, - rep - ); - self.metrics - .critical_error_response_destination_not_found - .inc(); + if rep.is_best_effort() { + // Expected when the destination subnet has been deleted. + warn!( + self.log, + "Discarding best-effort response, destination not found: {:?}", + rep + ); + } else { + error!( + self.log, + "{}: Discarding response, destination not found: {:?}", + CRITICAL_ERROR_RESPONSE_DESTINATION_NOT_FOUND, + rep + ); + self.metrics + .critical_error_response_destination_not_found + .inc(); + } } } } From bb49a4a8a4133dcc13044e32878c00d88ad64963 Mon Sep 17 00:00:00 2001 From: Martin Raszyk Date: Sun, 10 May 2026 08:28:43 +0000 Subject: [PATCH 3/9] harden test of message handling to deleted subnet --- rs/messaging/src/state_machine/tests.rs | 204 +++++++++++++++++++++--- 1 file changed, 185 insertions(+), 19 deletions(-) diff --git a/rs/messaging/src/state_machine/tests.rs b/rs/messaging/src/state_machine/tests.rs index d96b0db65ba2..7a5c80607da6 100644 --- a/rs/messaging/src/state_machine/tests.rs +++ b/rs/messaging/src/state_machine/tests.rs @@ -1,26 +1,31 @@ use super::*; -use crate::message_routing::CRITICAL_ERROR_NON_INCREASING_BATCH_TIME; +use crate::message_routing::{CRITICAL_ERROR_NON_INCREASING_BATCH_TIME, LatencyMetrics}; use crate::routing::demux::MockDemux; -use crate::routing::stream_builder::MockStreamBuilder; +use crate::routing::stream_builder::{MockStreamBuilder, StreamBuilderImpl}; use crate::state_machine::StateMachineImpl; +use ic_config::message_routing::{MAX_STREAM_MESSAGES, TARGET_STREAM_SIZE_BYTES}; use ic_interfaces::execution_environment::Scheduler; +use ic_limits::SYSTEM_SUBNET_STREAM_MSG_LIMIT; use ic_metrics::MetricsRegistry; use ic_registry_routing_table::{CANISTER_IDS_PER_SUBNET, CanisterIdRange, RoutingTable}; use ic_registry_subnet_features::SubnetFeatures; use ic_registry_subnet_type::SubnetType; use ic_replicated_state::{ - ReplicatedState, SubnetTopology, metadata_state::testing::NetworkTopologyTesting, - testing::ReplicatedStateTesting, + InputQueueType, ReplicatedState, SubnetTopology, + metadata_state::testing::NetworkTopologyTesting, testing::ReplicatedStateTesting, }; use ic_test_utilities_execution_environment::test_registry_settings; use ic_test_utilities_logger::with_test_replica_logger; use ic_test_utilities_metrics::{fetch_int_counter_vec, nonzero_values}; -use ic_test_utilities_state::new_canister_state; +use ic_test_utilities_state::{new_canister_state, register_callback}; use ic_test_utilities_types::batch::BatchBuilder; -use ic_test_utilities_types::ids::{SUBNET_0, SUBNET_1, SUBNET_2}; -use ic_test_utilities_types::messages::SignedIngressBuilder; +use ic_test_utilities_types::ids::{SUBNET_0, SUBNET_1, SUBNET_2, canister_test_id}; +use ic_test_utilities_types::messages::{RequestBuilder, SignedIngressBuilder}; use ic_types::batch::{BatchMessages, BlockmakerMetrics, ChainKeyData}; -use ic_types::messages::SignedIngress; +use ic_types::messages::{ + CallbackId, CanisterMessage, NO_DEADLINE, Payload, Response, SignedIngress, +}; +use ic_types::time::{CoarseTime, UNIX_EPOCH}; use ic_types::{ CanisterId, Height, PrincipalId, Randomness, RegistryVersion, ReplicaVersion, Time, }; @@ -28,6 +33,7 @@ use ic_types_cycles::{CanisterCyclesCostSchedule, Cycles}; use maplit::btreemap; use mockall::{Sequence, mock, predicate::*}; use std::collections::{BTreeMap, BTreeSet}; +use std::sync::{Arc, Mutex}; use std::time::Duration; mock! { @@ -251,30 +257,163 @@ fn test_delivered_batch_interface() { } #[test] -fn state_machine_discards_stream_for_deleted_subnet() { - let provided_batch = BatchBuilder::new().batch_number(Height::new(1)).build(); - let fixture = test_fixture(&provided_batch); +fn state_machine_handles_messages_to_deleted_subnet() { + let provided_batch = BatchBuilder::new() + .batch_number(Height::new(1)) + .time(Time::from_nanos_since_unix_epoch(1)) + .build(); + + let mut demux = Box::new(MockDemux::new()); + demux + .expect_process_payload() + .times(1) + .returning(|state, _, _| state); + + let mut scheduler = Box::new(MockScheduler::new()); + scheduler + .expect_execute_round() + .times(1) + .returning(|state, _, _, _, _, _, _, _| state); - // Add a stream to SUBNET_2, which is not present in the fixture's network topology. - let mut initial_state = fixture.initial_state; + // Initial state with a stream to SUBNET_2, which is not in the network topology. + let mut initial_state = ReplicatedState::new(SUBNET_1, SubnetType::Application); initial_state.modify_streams(|streams| { streams.insert(SUBNET_2, Default::default()); }); assert!(initial_state.get_stream(&SUBNET_2).is_some()); + // Add a canister with an output request, a best-effort output response, and a subnet + // message (callee = the deleted subnet's ID), all destined for the deleted subnet. + let local_canister_id = canister_test_id(0); + // Use a canister ID outside the routing table range so it has no route, + // causing the stream builder to generate a reject for the output request. + let remote_canister_id = CanisterId::from_u64(CANISTER_IDS_PER_SUBNET); + let mut canister_state = new_canister_state( + local_canister_id, + PrincipalId::new_anonymous(), + Cycles::new(1_000_000_000_000), + 3600.into(), + ); + + // Output request: local → remote (on the deleted subnet). + // Requests with no route get a reject response — no critical error. + let callback_id = register_callback(&mut canister_state, remote_canister_id, NO_DEADLINE); + canister_state + .push_output_request( + Arc::new( + RequestBuilder::new() + .sender(local_canister_id) + .receiver(remote_canister_id) + .sender_reply_callback(callback_id) + .build(), + ), + UNIX_EPOCH, + ) + .unwrap(); + + // Output response: local → remote (best-effort to avoid critical error). + // Best-effort responses with no route are dropped without a critical error. + // First push then pop a matching input request to create the output-queue reservation. + let response_deadline = CoarseTime::from_secs_since_unix_epoch(u32::MAX); + let mut subnet_available_memory = i64::MAX / 2; + canister_state + .push_input( + RequestBuilder::new() + .sender(remote_canister_id) + .receiver(local_canister_id) + .build() + .into(), + &mut subnet_available_memory, + SubnetType::Application, + InputQueueType::RemoteSubnet, + ) + .unwrap(); + canister_state.pop_input().unwrap(); + canister_state.push_output_response(Arc::new(Response { + originator: remote_canister_id, + respondent: local_canister_id, + originator_reply_callback: CallbackId::from(0), + refund: Cycles::zero(), + response_payload: Payload::Data(vec![]), + deadline: response_deadline, + })); + + // Output subnet message: local → SUBNET_2 (callee = the deleted subnet's ID). + // Subnet messages with no route get a reject response — no critical error. + let subnet_as_canister_id = CanisterId::from(SUBNET_2); + let subnet_callback_id = + register_callback(&mut canister_state, subnet_as_canister_id, NO_DEADLINE); + canister_state + .push_output_request( + Arc::new( + RequestBuilder::new() + .sender(local_canister_id) + .receiver(subnet_as_canister_id) + .sender_reply_callback(subnet_callback_id) + .build(), + ), + UNIX_EPOCH, + ) + .unwrap(); + + initial_state.put_canister_state(canister_state); + + // Network topology with only SUBNET_0 (NNS) and SUBNET_1 (local); SUBNET_2 is absent. + let mut subnets = BTreeMap::new(); + subnets.insert( + SUBNET_0, + SubnetTopology { + public_key: vec![0, 1, 2, 3], + nodes: BTreeSet::new(), + subnet_type: SubnetType::Application, + subnet_features: SubnetFeatures::default(), + chain_keys_held: BTreeSet::new(), + cost_schedule: CanisterCyclesCostSchedule::Normal, + subnet_admins: BTreeSet::new(), + }, + ); + subnets.insert(SUBNET_1, SubnetTopology::default()); + let mut network_topology = NetworkTopology::default(); + network_topology.nns_subnet_id = SUBNET_0; + network_topology.set_subnets(subnets); + network_topology.set_routing_table( + RoutingTable::try_from(btreemap! { + CanisterIdRange { + start: CanisterId::from_u64(0), + end: CanisterId::from_u64(CANISTER_IDS_PER_SUBNET - 1), + } => SUBNET_1, + }) + .unwrap(), + ); + with_test_replica_logger(|log| { + let metrics_registry = MetricsRegistry::new(); + let message_routing_metrics = MessageRoutingMetrics::new(&metrics_registry); + let stream_builder = Box::new(StreamBuilderImpl::new( + SUBNET_1, + MAX_STREAM_MESSAGES, + TARGET_STREAM_SIZE_BYTES, + SYSTEM_SUBNET_STREAM_MSG_LIMIT, + &metrics_registry, + &message_routing_metrics, + Arc::new(Mutex::new(LatencyMetrics::new_time_in_stream( + &metrics_registry, + ))), + log.clone(), + )); + let state_machine = Box::new(StateMachineImpl::new( - fixture.scheduler, - fixture.demux, - fixture.stream_builder, + scheduler, + demux, + stream_builder, Default::default(), log, - fixture.metrics, + message_routing_metrics, )); - let state = state_machine.execute_round( + let mut state = state_machine.execute_round( initial_state, - fixture.network_topology.clone(), + network_topology, provided_batch, Default::default(), Default::default(), @@ -283,7 +422,34 @@ fn state_machine_discards_stream_for_deleted_subnet() { Default::default(), ); + // Stream to the deleted subnet is gone. assert!(state.get_stream(&SUBNET_2).is_none()); + // Output queue is empty: request was consumed (reject response generated), + // best-effort response was dropped. + assert!( + !state + .canister_state(&local_canister_id) + .unwrap() + .has_output() + ); + // Reject responses for both unroutable output requests (canister request and subnet + // message) are in the local canister's input queue. + let canister = Arc::make_mut(state.canister_state_mut_arc(&local_canister_id).unwrap()); + let msg1 = canister.pop_input().unwrap(); + let msg2 = canister.pop_input().unwrap(); + assert!(canister.pop_input().is_none()); + for msg in [msg1, msg2] { + assert!(matches!( + msg, + CanisterMessage::Response { response, .. } + if response.originator == local_canister_id + && matches!(response.response_payload, Payload::Reject(_)) + )); + } + // No critical error was raised. + assert!( + nonzero_values(fetch_int_counter_vec(&metrics_registry, "critical_errors")).is_empty() + ); }); } From 43b64ced8e48761eda60f5f17db4d964910dd619 Mon Sep 17 00:00:00 2001 From: Martin Raszyk Date: Mon, 11 May 2026 06:12:36 +0000 Subject: [PATCH 4/9] tests --- rs/messaging/src/state_machine/tests.rs | 33 ++++++++++++------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/rs/messaging/src/state_machine/tests.rs b/rs/messaging/src/state_machine/tests.rs index 7a5c80607da6..7f896b19902b 100644 --- a/rs/messaging/src/state_machine/tests.rs +++ b/rs/messaging/src/state_machine/tests.rs @@ -19,12 +19,10 @@ use ic_test_utilities_logger::with_test_replica_logger; use ic_test_utilities_metrics::{fetch_int_counter_vec, nonzero_values}; use ic_test_utilities_state::{new_canister_state, register_callback}; use ic_test_utilities_types::batch::BatchBuilder; -use ic_test_utilities_types::ids::{SUBNET_0, SUBNET_1, SUBNET_2, canister_test_id}; +use ic_test_utilities_types::ids::{SUBNET_0, SUBNET_1, SUBNET_2}; use ic_test_utilities_types::messages::{RequestBuilder, SignedIngressBuilder}; use ic_types::batch::{BatchMessages, BlockmakerMetrics, ChainKeyData}; -use ic_types::messages::{ - CallbackId, CanisterMessage, NO_DEADLINE, Payload, Response, SignedIngress, -}; +use ic_types::messages::{CallbackId, CanisterMessage, Payload, Response, SignedIngress}; use ic_types::time::{CoarseTime, UNIX_EPOCH}; use ic_types::{ CanisterId, Height, PrincipalId, Randomness, RegistryVersion, ReplicaVersion, Time, @@ -282,12 +280,14 @@ fn state_machine_handles_messages_to_deleted_subnet() { }); assert!(initial_state.get_stream(&SUBNET_2).is_some()); - // Add a canister with an output request, a best-effort output response, and a subnet - // message (callee = the deleted subnet's ID), all destined for the deleted subnet. - let local_canister_id = canister_test_id(0); + // Add a canister with a bounded-wait output request, a best-effort output response, and a + // bounded-wait subnet message (callee = the deleted subnet's ID), all destined for the + // deleted subnet. + let local_canister_id = CANISTER_RANGE_A.start; // Use a canister ID outside the routing table range so it has no route, // causing the stream builder to generate a reject for the output request. - let remote_canister_id = CanisterId::from_u64(CANISTER_IDS_PER_SUBNET); + let remote_canister_id = CANISTER_RANGE_B.start; + let deadline = CoarseTime::from_secs_since_unix_epoch(u32::MAX); let mut canister_state = new_canister_state( local_canister_id, PrincipalId::new_anonymous(), @@ -297,7 +297,7 @@ fn state_machine_handles_messages_to_deleted_subnet() { // Output request: local → remote (on the deleted subnet). // Requests with no route get a reject response — no critical error. - let callback_id = register_callback(&mut canister_state, remote_canister_id, NO_DEADLINE); + let callback_id = register_callback(&mut canister_state, remote_canister_id, deadline); canister_state .push_output_request( Arc::new( @@ -305,6 +305,7 @@ fn state_machine_handles_messages_to_deleted_subnet() { .sender(local_canister_id) .receiver(remote_canister_id) .sender_reply_callback(callback_id) + .deadline(deadline) .build(), ), UNIX_EPOCH, @@ -314,7 +315,6 @@ fn state_machine_handles_messages_to_deleted_subnet() { // Output response: local → remote (best-effort to avoid critical error). // Best-effort responses with no route are dropped without a critical error. // First push then pop a matching input request to create the output-queue reservation. - let response_deadline = CoarseTime::from_secs_since_unix_epoch(u32::MAX); let mut subnet_available_memory = i64::MAX / 2; canister_state .push_input( @@ -335,14 +335,14 @@ fn state_machine_handles_messages_to_deleted_subnet() { originator_reply_callback: CallbackId::from(0), refund: Cycles::zero(), response_payload: Payload::Data(vec![]), - deadline: response_deadline, + deadline, })); // Output subnet message: local → SUBNET_2 (callee = the deleted subnet's ID). // Subnet messages with no route get a reject response — no critical error. let subnet_as_canister_id = CanisterId::from(SUBNET_2); let subnet_callback_id = - register_callback(&mut canister_state, subnet_as_canister_id, NO_DEADLINE); + register_callback(&mut canister_state, subnet_as_canister_id, deadline); canister_state .push_output_request( Arc::new( @@ -350,6 +350,7 @@ fn state_machine_handles_messages_to_deleted_subnet() { .sender(local_canister_id) .receiver(subnet_as_canister_id) .sender_reply_callback(subnet_callback_id) + .deadline(deadline) .build(), ), UNIX_EPOCH, @@ -365,7 +366,7 @@ fn state_machine_handles_messages_to_deleted_subnet() { SubnetTopology { public_key: vec![0, 1, 2, 3], nodes: BTreeSet::new(), - subnet_type: SubnetType::Application, + subnet_type: SubnetType::System, subnet_features: SubnetFeatures::default(), chain_keys_held: BTreeSet::new(), cost_schedule: CanisterCyclesCostSchedule::Normal, @@ -378,10 +379,8 @@ fn state_machine_handles_messages_to_deleted_subnet() { network_topology.set_subnets(subnets); network_topology.set_routing_table( RoutingTable::try_from(btreemap! { - CanisterIdRange { - start: CanisterId::from_u64(0), - end: CanisterId::from_u64(CANISTER_IDS_PER_SUBNET - 1), - } => SUBNET_1, + CANISTER_RANGE_NNS => SUBNET_0, + CANISTER_RANGE_A => SUBNET_1, }) .unwrap(), ); From 59e8329af7b95a192fe56a56dcd6a5d022d3c859 Mon Sep 17 00:00:00 2001 From: Martin Raszyk Date: Mon, 11 May 2026 06:20:35 +0000 Subject: [PATCH 5/9] tests --- rs/messaging/src/state_machine/tests.rs | 32 ++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/rs/messaging/src/state_machine/tests.rs b/rs/messaging/src/state_machine/tests.rs index 7f896b19902b..f7e1004ecbbf 100644 --- a/rs/messaging/src/state_machine/tests.rs +++ b/rs/messaging/src/state_machine/tests.rs @@ -282,7 +282,8 @@ fn state_machine_handles_messages_to_deleted_subnet() { // Add a canister with a bounded-wait output request, a best-effort output response, and a // bounded-wait subnet message (callee = the deleted subnet's ID), all destined for the - // deleted subnet. + // deleted subnet. Also add a bounded-wait subnet output response (remote canister → + // local subnet) to the subnet queues. let local_canister_id = CANISTER_RANGE_A.start; // Use a canister ID outside the routing table range so it has no route, // causing the stream builder to generate a reject for the output request. @@ -359,6 +360,30 @@ fn state_machine_handles_messages_to_deleted_subnet() { initial_state.put_canister_state(canister_state); + // Subnet output response: local subnet → remote (bounded-wait). + // Bounded-wait responses with no route are dropped without a critical error. + // First push then pop a matching input request to create the output-queue reservation. + initial_state + .push_input( + RequestBuilder::new() + .sender(remote_canister_id) + .receiver(CanisterId::from(SUBNET_1)) + .deadline(deadline) + .build() + .into(), + &mut subnet_available_memory, + ) + .unwrap(); + initial_state.pop_subnet_input().unwrap(); + initial_state.push_subnet_output_response(Arc::new(Response { + originator: remote_canister_id, + respondent: CanisterId::from(SUBNET_1), + originator_reply_callback: CallbackId::from(0), + refund: Cycles::zero(), + response_payload: Payload::Data(vec![]), + deadline, + })); + // Network topology with only SUBNET_0 (NNS) and SUBNET_1 (local); SUBNET_2 is absent. let mut subnets = BTreeMap::new(); subnets.insert( @@ -423,14 +448,15 @@ fn state_machine_handles_messages_to_deleted_subnet() { // Stream to the deleted subnet is gone. assert!(state.get_stream(&SUBNET_2).is_none()); - // Output queue is empty: request was consumed (reject response generated), - // best-effort response was dropped. + // Output queues are empty: requests were consumed (reject responses generated), + // best-effort and bounded-wait responses were dropped. assert!( !state .canister_state(&local_canister_id) .unwrap() .has_output() ); + assert!(!state.subnet_queues().has_output()); // Reject responses for both unroutable output requests (canister request and subnet // message) are in the local canister's input queue. let canister = Arc::make_mut(state.canister_state_mut_arc(&local_canister_id).unwrap()); From 9120310b11558e1ec437517281fd1c66b2984f92 Mon Sep 17 00:00:00 2001 From: Martin Raszyk Date: Mon, 11 May 2026 06:23:21 +0000 Subject: [PATCH 6/9] typo --- rs/messaging/src/state_machine/tests.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rs/messaging/src/state_machine/tests.rs b/rs/messaging/src/state_machine/tests.rs index f7e1004ecbbf..a0490139981a 100644 --- a/rs/messaging/src/state_machine/tests.rs +++ b/rs/messaging/src/state_machine/tests.rs @@ -282,8 +282,8 @@ fn state_machine_handles_messages_to_deleted_subnet() { // Add a canister with a bounded-wait output request, a best-effort output response, and a // bounded-wait subnet message (callee = the deleted subnet's ID), all destined for the - // deleted subnet. Also add a bounded-wait subnet output response (remote canister → - // local subnet) to the subnet queues. + // deleted subnet. Also add a bounded-wait subnet output response (local subnet → + // remote canister) to the subnet queues. let local_canister_id = CANISTER_RANGE_A.start; // Use a canister ID outside the routing table range so it has no route, // causing the stream builder to generate a reject for the output request. From 1fb0a98b6ae349b164c5c47060277b8dca454e29 Mon Sep 17 00:00:00 2001 From: Martin Raszyk Date: Mon, 11 May 2026 09:13:46 +0000 Subject: [PATCH 7/9] test certified stream slice pruning --- .../tests/certified_slice_pool.rs | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/rs/xnet/payload_builder/tests/certified_slice_pool.rs b/rs/xnet/payload_builder/tests/certified_slice_pool.rs index 396f0c1ef2a0..79cb1eb97aeb 100644 --- a/rs/xnet/payload_builder/tests/certified_slice_pool.rs +++ b/rs/xnet/payload_builder/tests/certified_slice_pool.rs @@ -1342,3 +1342,52 @@ fn pool_take_slice_respects_signal_limit( ); }); } + +/// Tests that a pooled certified stream slice from a subnet that has been +/// deleted (i.e., removed from the subnet list) is garbage collected when +/// `garbage_collect()` is called without that subnet. +#[test_strategy::proptest(ProptestConfig::with_cases(20))] +fn pool_garbage_collect_deleted_subnet( + #[strategy(arb_stream_slice( + 1, // min_size + 10, // max_size + 0, // min_signal_count + 10, // max_signal_count + ))] + test_slice: (Stream, StreamIndex, usize), +) { + let (stream, from, msg_count) = test_slice; + + with_test_replica_logger(|log| { + let stream_position = ExpectedIndices { + message_index: from, + signal_index: stream.signals_end(), + }; + + let fixture = StateManagerFixture::remote(log.clone()).with_stream(DST_SUBNET, stream); + let slice = fixture.get_slice(DST_SUBNET, from, msg_count); + + let mut certified_stream_store = MockCertifiedStreamStore::new(); + certified_stream_store + .expect_decode_certified_stream_slice() + .returning(|_, _, _| Ok(StreamSliceBuilder::new().build())); + let certified_stream_store = Arc::new(certified_stream_store) as Arc<_>; + let mut pool = + CertifiedSlicePool::new(Arc::clone(&certified_stream_store), &MetricsRegistry::new()); + + // Register SRC_SUBNET as a known peer and pool a slice from it. + pool.garbage_collect(btreemap! {SRC_SUBNET => stream_position}); + pool.put(SRC_SUBNET, slice, REGISTRY_VERSION, log).unwrap(); + + // Sanity check: SRC_SUBNET is a known peer with a pooled slice. + assert!(pool.peers().any(|&id| id == SRC_SUBNET)); + assert!(!matches!(pool.slice_stats(SRC_SUBNET), (_, None, 0, 0))); + + // Simulate subnet deletion: call garbage_collect without SRC_SUBNET. + pool.garbage_collect(Default::default()); + + // Both the stream position and the slice should have been dropped. + assert!(pool.peers().next().is_none()); + assert_eq!((None, None, 0, 0), pool.slice_stats(SRC_SUBNET)); + }); +} From fb691e205502528b3344a82d6aadad121734d150 Mon Sep 17 00:00:00 2001 From: Martin Raszyk Date: Mon, 11 May 2026 09:24:30 +0000 Subject: [PATCH 8/9] only silently discard best-effort responses with no cycles --- rs/messaging/src/routing/stream_builder.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rs/messaging/src/routing/stream_builder.rs b/rs/messaging/src/routing/stream_builder.rs index 3d78b2822c0f..1ca3ec42541f 100644 --- a/rs/messaging/src/routing/stream_builder.rs +++ b/rs/messaging/src/routing/stream_builder.rs @@ -575,8 +575,9 @@ impl StreamBuilderImpl { } RequestOrResponse::Response(rep) => { // A Response: discard it. - if rep.is_best_effort() { - // Expected when the destination subnet has been deleted. + if rep.is_best_effort() && rep.refund.is_zero() { + // Expected when the destination subnet has been deleted: best-effort + // responses with no cycles refund can be safely discarded. warn!( self.log, "Discarding best-effort response, destination not found: {:?}", From 519181076b7d783a5180671eca935059623de1bd Mon Sep 17 00:00:00 2001 From: Martin Raszyk Date: Mon, 11 May 2026 09:35:23 +0000 Subject: [PATCH 9/9] debug --- rs/messaging/src/routing/stream_builder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rs/messaging/src/routing/stream_builder.rs b/rs/messaging/src/routing/stream_builder.rs index 1ca3ec42541f..60b857d74922 100644 --- a/rs/messaging/src/routing/stream_builder.rs +++ b/rs/messaging/src/routing/stream_builder.rs @@ -2,7 +2,7 @@ use crate::message_routing::{ CRITICAL_ERROR_INDUCT_RESPONSE_FAILED, LatencyMetrics, MessageRoutingMetrics, }; use ic_error_types::RejectCode; -use ic_logger::{ReplicaLogger, error, warn}; +use ic_logger::{ReplicaLogger, debug, error, warn}; use ic_metrics::{MetricsRegistry, buckets::decimal_buckets}; use ic_registry_subnet_type::SubnetType; use ic_replicated_state::replicated_state::{ @@ -578,7 +578,7 @@ impl StreamBuilderImpl { if rep.is_best_effort() && rep.refund.is_zero() { // Expected when the destination subnet has been deleted: best-effort // responses with no cycles refund can be safely discarded. - warn!( + debug!( self.log, "Discarding best-effort response, destination not found: {:?}", rep