diff --git a/consensus/src/counters.rs b/consensus/src/counters.rs
index fcbebf6eeb6..783e312cbfc 100644
--- a/consensus/src/counters.rs
+++ b/consensus/src/counters.rs
@@ -342,6 +342,34 @@ pub static WAIT_FOR_FULL_BLOCKS_TRIGGERED: Lazy<Histogram> = Lazy::new(|| {
     )
 });
 
+/// Duration of the full pull loop (outer loop with retries) in seconds.
+/// Custom buckets cover 0–1s so the 250–500ms default-bucket gap doesn't
+/// hide the 300ms poll ceiling.
+pub static PULL_LOOP_DURATION: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "aptos_consensus_pull_loop_duration_seconds",
+        "Duration of the full payload pull loop including retries",
+        // Sub-ms to 10ms for fast path, then 30ms steps (matching NO_TXN_DELAY)
+        // up to 330ms, then coarser up to 1s for non-default configs.
+        vec![
+            0.001, 0.002, 0.005, 0.01, 0.03, 0.06, 0.09, 0.12, 0.15, 0.18, 0.21, 0.24, 0.27, 0.30,
+            0.33, 0.5, 0.75, 1.0,
+        ],
+    )
+    .unwrap()
+});
+
+/// Number of empty retries in the pull loop before getting a payload.
+/// Buckets cover up to 34 retries (enough for 1000ms non-default poll time).
+pub static PULL_LOOP_EMPTY_RETRIES: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "aptos_consensus_pull_loop_empty_retries",
+        "Number of empty retries in the pull loop",
+        vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 15.0, 20.0, 25.0, 30.0, 35.0],
+    )
+    .unwrap()
+});
+
 /// Counts when pipeline backpressure is triggered
 pub static PIPELINE_BACKPRESSURE_ON_PROPOSAL_TRIGGERED: Lazy<Histogram> = Lazy::new(|| {
     register_avg_counter(
diff --git a/consensus/src/payload_client/user/quorum_store_client.rs b/consensus/src/payload_client/user/quorum_store_client.rs
index 8e9b0ef6f14..451ac32b662 100644
--- a/consensus/src/payload_client/user/quorum_store_client.rs
+++ b/consensus/src/payload_client/user/quorum_store_client.rs
@@ -2,7 +2,9 @@
 // Licensed pursuant to the Innovation-Enabling Source Code License, available at https://github.com/aptos-labs/aptos-core/blob/main/LICENSE
 
 use crate::{
-    counters::WAIT_FOR_FULL_BLOCKS_TRIGGERED, error::QuorumStoreError, monitor,
+    counters::{PULL_LOOP_DURATION, PULL_LOOP_EMPTY_RETRIES, WAIT_FOR_FULL_BLOCKS_TRIGGERED},
+    error::QuorumStoreError,
+    monitor,
     payload_client::user::UserPayloadClient,
 };
 use aptos_consensus_types::{
@@ -105,6 +107,7 @@ impl UserPayloadClient for QuorumStoreClient {
         });
         // keep polling QuorumStore until there's payloads available or there's still pending payloads
         let start_time = Instant::now();
+        let mut empty_retries: u64 = 0;
 
         let payload = loop {
             // Make sure we don't wait more than expected, due to thread scheduling delays/processing time consumed
@@ -122,15 +125,20 @@ impl UserPayloadClient for QuorumStoreClient {
                 )
                 .await?;
             if payload.is_empty() && !return_empty && !done {
+                empty_retries += 1;
                 sleep(Duration::from_millis(NO_TXN_DELAY)).await;
                 continue;
             }
             break payload;
         };
+        let pull_duration = start_time.elapsed();
+        PULL_LOOP_DURATION.observe(pull_duration.as_secs_f64());
+        PULL_LOOP_EMPTY_RETRIES.observe(empty_retries as f64);
         debug!(
             pull_params = ?params,
-            duration_ms = start_time.elapsed().as_millis() as u64,
+            duration_ms = pull_duration.as_millis() as u64,
             payload_len = payload.len(),
+            empty_retries = empty_retries,
             return_empty = return_empty,
             return_non_full = return_non_full,
             "Pull payloads from QuorumStore: proposal"