diff --git a/aptos-node/src/lib.rs b/aptos-node/src/lib.rs index 572a153d66c..c16a58138c4 100644 --- a/aptos-node/src/lib.rs +++ b/aptos-node/src/lib.rs @@ -26,6 +26,7 @@ use aptos_build_info::build_information; use aptos_config::config::{merge_node_config, NodeConfig, PersistableConfig}; use aptos_framework::ReleaseBundle; use aptos_genesis::builder::GenesisConfiguration; +use aptos_inspection_service::server::InspectionServiceComponents; use aptos_logger::{prelude::*, telemetry_log_writer::TelemetryLog, Level, LoggerFilterUpdater}; use aptos_state_sync_driver::driver_factory::StateSyncRuntime; use aptos_types::{ @@ -701,6 +702,14 @@ pub fn setup_environment_and_start_node( // Starts the admin service let mut admin_service = services::start_admin_service(&node_config); + // Start the inspection service (port 9101) early — before RocksDB — so that + // Prometheus metrics are scrapeable from the very first moments of startup. + // Components that require a fully-initialised node (peer information) will + // return 503 until `inspection_components.set(...)` is called below. + let inspection_components = Arc::new(InspectionServiceComponents::new()); + let inspection_service_runtime = + services::start_node_inspection_service(&node_config, inspection_components.clone()); + // Initialize transaction tracing from config { let tracing_cfg = &node_config.transaction_tracing; @@ -805,12 +814,9 @@ pub fn setup_environment_and_start_node( db_rw.clone(), )?; - // Start the node inspection service - let inspection_service_runtime = services::start_node_inspection_service( - &node_config, - aptos_data_client, - peers_and_metadata.clone(), - ); + // Inject the now-available components into the already-running inspection service. + // This unblocks /peer_information (and any other endpoints that need these values). 
+ inspection_components.set(aptos_data_client, peers_and_metadata.clone()); // Bootstrap the API and transaction streaming services let ( diff --git a/aptos-node/src/services.rs b/aptos-node/src/services.rs index 6e8e790e347..6f502317c61 100644 --- a/aptos-node/src/services.rs +++ b/aptos-node/src/services.rs @@ -11,13 +11,13 @@ use aptos_consensus::{ quorum_store::quorum_store_db::QuorumStoreDB, }; use aptos_consensus_notifications::ConsensusNotifier; -use aptos_data_client::client::AptosDataClient; use aptos_db_indexer::{db_indexer::InternalIndexerDB, indexer_reader::IndexerReaders}; use aptos_event_notifications::{DbBackedOnChainConfig, ReconfigNotificationListener}; use aptos_indexer_grpc_fullnode::runtime::bootstrap as bootstrap_indexer_grpc; use aptos_indexer_grpc_table_info::runtime::{ bootstrap as bootstrap_indexer_table_info, bootstrap_internal_indexer_db, }; +use aptos_inspection_service::server::InspectionServiceComponents; use aptos_logger::{debug, telemetry_log_writer::TelemetryLog, LoggerFilterUpdater}; use aptos_mempool::{ network::MempoolSyncMsg, MempoolClientRequest, MempoolClientSender, QuorumStoreRequest, @@ -202,17 +202,16 @@ pub fn start_admin_service(node_config: &NodeConfig) -> AdminService { AdminService::new(node_config) } -/// Starts the node inspection service and returns the runtime +/// Starts the node inspection service and returns the runtime. +/// +/// Pass an `Arc<InspectionServiceComponents>` whose fields are initially empty. +/// After the rest of the node has initialised, call `components.set(...)` to +/// inject the live values, at which point all endpoints will become fully operational. 
pub fn start_node_inspection_service( node_config: &NodeConfig, - aptos_data_client: AptosDataClient, - peers_and_metadata: Arc, + components: Arc, ) -> Runtime { - aptos_inspection_service::start_inspection_service( - node_config.clone(), - aptos_data_client, - peers_and_metadata, - ) + aptos_inspection_service::start_inspection_service(node_config.clone(), components) } /// Starts the peer monitoring service and returns the runtime diff --git a/crates/aptos-inspection-service/src/server/mod.rs b/crates/aptos-inspection-service/src/server/mod.rs index 3744b3f8aab..37f5b224ed0 100644 --- a/crates/aptos-inspection-service/src/server/mod.rs +++ b/crates/aptos-inspection-service/src/server/mod.rs @@ -13,10 +13,37 @@ use hyper::{ use std::{ convert::Infallible, net::{SocketAddr, ToSocketAddrs}, - sync::Arc, + sync::{Arc, OnceLock}, }; use tokio::runtime::Runtime; +/// Holds the components that are injected into the inspection service after it starts. +/// Uses `OnceLock` so the service can start before these are available. +#[derive(Default)] +pub struct InspectionServiceComponents { + pub data_client: OnceLock, + pub peers_and_metadata: OnceLock>, +} + +impl InspectionServiceComponents { + pub fn new() -> Self { + Self { + data_client: OnceLock::new(), + peers_and_metadata: OnceLock::new(), + } + } + + /// Inject both components once they are available. + pub fn set(&self, data_client: AptosDataClient, peers_and_metadata: Arc) { + self.data_client + .set(data_client) + .expect("data_client already set"); + self.peers_and_metadata + .set(peers_and_metadata) + .expect("peers_and_metadata already set"); + } +} + mod configuration; mod identity_information; mod index; @@ -48,10 +75,14 @@ pub const UNEXPECTED_ERROR_MESSAGE: &str = "An unexpected error was encountered! /// Starts the inspection service that listens on the configured /// address and handles various endpoint requests. Returns the /// runtime so the caller can keep it alive. 
+/// +/// `components` is an `Arc<InspectionServiceComponents>` whose fields start out +/// empty and are filled in via `components.set(...)` once the rest of the node +/// has finished initialising. Until then, endpoints that require those values +/// (e.g. `/peer_information`) will return 503. pub fn start_inspection_service( node_config: NodeConfig, - aptos_data_client: AptosDataClient, - peers_and_metadata: Arc, + components: Arc, ) -> Runtime { // Fetch the service port and address let service_port = node_config.inspection_service.port; @@ -80,16 +111,10 @@ pub fn start_inspection_service( // Create the service function that handles the endpoint requests let make_service = make_service_fn(move |_conn| { let node_config = node_config.clone(); - let aptos_data_client = aptos_data_client.clone(); - let peers_and_metadata = peers_and_metadata.clone(); + let components = components.clone(); async move { Ok::<_, Infallible>(service_fn(move |request| { - serve_requests( - request, - node_config.clone(), - aptos_data_client.clone(), - peers_and_metadata.clone(), - ) + serve_requests(request, node_config.clone(), components.clone()) })) } }); @@ -106,9 +131,12 @@ pub fn start_inspection_service( async fn serve_requests( req: Request, node_config: NodeConfig, - aptos_data_client: AptosDataClient, - peers_and_metadata: Arc, + components: Arc, ) -> Result, hyper::Error> { + // Read the optional components (may be None during early startup) + let aptos_data_client = components.data_client.get().cloned(); + let peers_and_metadata = components.peers_and_metadata.get().cloned(); + // Process the request and get the response components let (status_code, body, content_type) = match req.uri().path() { CONFIGURATION_PATH => { diff --git a/crates/aptos-inspection-service/src/server/peer_information.rs b/crates/aptos-inspection-service/src/server/peer_information.rs index da0307faf30..825f2c2e9dc 100644 --- a/crates/aptos-inspection-service/src/server/peer_information.rs +++ 
b/crates/aptos-inspection-service/src/server/peer_information.rs @@ -17,21 +17,33 @@ use std::{collections::BTreeMap, ops::Deref, sync::Arc}; pub const PEER_INFO_DISABLED_MESSAGE: &str = "This endpoint is disabled! Enable it in the node config at inspection_service.expose_peer_information: true"; -/// Handles a new peer information request +// The message to display while the node is still initializing +pub const PEER_INFO_INITIALIZING_MESSAGE: &str = + "Node is still initializing — peer information is not yet available"; + +/// Handles a new peer information request. +/// +/// `aptos_data_client` and `peers_and_metadata` are `None` during early startup +/// (before the rest of the node has initialised). In that case the endpoint +/// returns 503 so callers know to retry rather than treating it as a hard error. pub fn handle_peer_information_request( node_config: &NodeConfig, - aptos_data_client: AptosDataClient, - peers_and_metadata: Arc, + aptos_data_client: Option, + peers_and_metadata: Option>, ) -> (StatusCode, Body, String) { - // Only return peer information if the endpoint is enabled - let (status_code, body) = if node_config.inspection_service.expose_peer_information { - let peer_information = get_peer_information(aptos_data_client, peers_and_metadata); - (StatusCode::OK, Body::from(peer_information)) - } else { - ( + let (status_code, body) = match (aptos_data_client, peers_and_metadata) { + _ if !node_config.inspection_service.expose_peer_information => ( StatusCode::FORBIDDEN, Body::from(PEER_INFO_DISABLED_MESSAGE), - ) + ), + (Some(data_client), Some(pam)) => { + let peer_information = get_peer_information(data_client, pam); + (StatusCode::OK, Body::from(peer_information)) + }, + _ => ( + StatusCode::SERVICE_UNAVAILABLE, + Body::from(PEER_INFO_INITIALIZING_MESSAGE), + ), }; (status_code, body, CONTENT_TYPE_TEXT.into()) diff --git a/crates/aptos-inspection-service/src/server/tests.rs b/crates/aptos-inspection-service/src/server/tests.rs index 
90bd4af1980..502e4509410 100644 --- a/crates/aptos-inspection-service/src/server/tests.rs +++ b/crates/aptos-inspection-service/src/server/tests.rs @@ -7,6 +7,7 @@ use crate::{ identity_information::IDENTITY_INFO_DISABLED_MESSAGE, peer_information::PEER_INFO_DISABLED_MESSAGE, serve_requests, system_information::SYS_INFO_DISABLED_MESSAGE, utils::get_all_metrics, + InspectionServiceComponents, }, CONFIGURATION_PATH, FORGE_METRICS_PATH, IDENTITY_INFORMATION_PATH, INDEX_PATH, JSON_METRICS_PATH, METRICS_PATH, PEER_INFORMATION_PATH, SYSTEM_INFORMATION_PATH, @@ -280,7 +281,9 @@ fn test_publish_metrics() { assert_approx_eq!(1.0, metrics.first().unwrap().get_counter().get_value()); } -// Exercise the serve_requests() handler with a GET request to the given path +// Exercise the serve_requests() handler with a GET request to the given path. +// Components are pre-populated with live values so all endpoints behave as they +// would on a fully-initialised node. async fn send_get_request_to_path(config: &NodeConfig, endpoint: &str) -> Response { // Build the URI let uri = format!("http://127.0.0.1:9201{}", endpoint); @@ -300,6 +303,10 @@ async fn send_get_request_to_path(config: &NodeConfig, endpoint: &str) -> Respon None, ); + // Build fully-populated inspection components + let components = Arc::new(InspectionServiceComponents::new()); + components.set(aptos_data_client, peers_and_metadata); + // Serve the request serve_requests( Request::builder() @@ -308,8 +315,7 @@ async fn send_get_request_to_path(config: &NodeConfig, endpoint: &str) -> Respon .body(Body::from("")) .unwrap(), config.clone(), - aptos_data_client, - peers_and_metadata, + components, ) .await .unwrap()