diff --git a/.gitignore b/.gitignore index 018d16c1f..3184dc929 100644 --- a/.gitignore +++ b/.gitignore @@ -41,6 +41,7 @@ test_logs # pre-commit config .pre-commit-config.yaml .cargo +!.cargo/config.toml #gradle files .gradle diff --git a/Cargo.lock b/Cargo.lock index 82a5d741f..081a72582 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5851,6 +5851,7 @@ dependencies = [ "bytes", "http 1.1.0", "opentelemetry", + "reqwest 0.12.8", ] [[package]] @@ -5867,6 +5868,7 @@ dependencies = [ "opentelemetry-proto", "opentelemetry_sdk", "prost", + "reqwest 0.12.8", "serde_json", "thiserror 1.0.58", "tokio", @@ -5874,6 +5876,19 @@ dependencies = [ "tracing", ] +[[package]] +name = "opentelemetry-prometheus" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b834e966ea5e2d03dfe5f2253f03d22cce21403ee940265070eeee96cee0bcc" +dependencies = [ + "once_cell", + "opentelemetry", + "opentelemetry_sdk", + "prometheus", + "protobuf", +] + [[package]] name = "opentelemetry-proto" version = "0.27.0" @@ -6412,6 +6427,21 @@ dependencies = [ "yansi", ] +[[package]] +name = "prometheus" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" +dependencies = [ + "cfg-if", + "fnv", + "lazy_static", + "memchr", + "parking_lot", + "protobuf", + "thiserror 1.0.58", +] + [[package]] name = "prost" version = "0.13.5" @@ -6435,6 +6465,12 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "protobuf" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" + [[package]] name = "psm" version = "0.1.30" @@ -6469,7 +6505,7 @@ dependencies = [ "quinn-udp", "rustc-hash 2.1.1", "rustls 0.23.28", - "socket2 0.5.10", + "socket2 0.6.3", "thiserror 2.0.18", "tokio", "tracing", @@ -6508,7 +6544,7 @@ dependencies = [ "cfg_aliases", "libc", 
"once_cell", - "socket2 0.5.10", + "socket2 0.6.3", "tracing", "windows-sys 0.60.2", ] @@ -7007,7 +7043,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.4.15", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -7664,11 +7700,17 @@ dependencies = [ "diesel-adapter", "fred", "futures-util", + "humantime", "inventory", "juspay_diesel", "log", "once_cell", "openidconnect", + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry-prometheus", + "opentelemetry_sdk", + "prometheus", "rand 0.8.5", "regex", "reqwest 0.11.27", @@ -7680,6 +7722,7 @@ dependencies = [ "superposition_derives", "superposition_macros", "superposition_types", + "thiserror 1.0.58", "tokio", "tracing", "tracing-actix-web", @@ -7925,7 +7968,6 @@ dependencies = [ "cfg-if", "libc", "psm", - "windows-sys 0.52.0", "windows-sys 0.59.0", ] @@ -8033,6 +8075,7 @@ dependencies = [ "leptos", "leptos_actix", "log", + "opentelemetry", "regex", "reqwest 0.11.27", "rs-snowflake", @@ -8042,6 +8085,7 @@ dependencies = [ "superposition_derives", "superposition_macros", "superposition_types", + "tokio", "tracing", "tracing-actix-web", "tracing-subscriber", diff --git a/Cargo.toml b/Cargo.toml index 48423bf56..4e04bd08f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -63,6 +63,7 @@ diesel = { version = "2.2.4", package = "juspay_diesel", features = [ ] } fred = { version = "9.2.1" } futures-util = "0.3.28" +humantime = "2.1" inventory = "0.3" itertools = { version = "0.10.5" } jsonlogic = { version = "0.5.5", package = "juspay_jsonlogic" } @@ -70,6 +71,11 @@ jsonschema = "~0.17" leptos = { version = "0.6.11" } log = { version = "0.4.20", features = ["kv_unstable_serde"] } once_cell = { version = "1.18.0" } +opentelemetry = { version = "0.27", default-features = false, features = ["metrics"] } +opentelemetry_sdk = { version = "0.27", default-features = false, features = ["metrics", "rt-tokio"] } +opentelemetry-prometheus = { version = "0.27", default-features = false } +opentelemetry-otlp = { version 
= "0.27", default-features = false, features = ["metrics", "http-proto", "reqwest-client"] } +prometheus = { version = "0.13", default-features = false } regex = "1.9.1" reqwest = { version = "0.11.18", features = ["json"] } rs-snowflake = "0.6.0" @@ -79,6 +85,7 @@ serde_json = { version = "1.0.140" } secrecy = "0.10" strum = "0.25" strum_macros = "0.25" +thiserror = "1" tokio = { version = "1.29.1", features = ["full"] } toml = { version = "0.8.8", features = ["preserve_order"] } tracing = "0.1.44" diff --git a/README.md b/README.md index 01fb6de24..97a4ca457 100644 --- a/README.md +++ b/README.md @@ -150,6 +150,14 @@ Want a broader systems view? Open the [DeepWiki architecture guide](https://deep - [Context7 LLM-friendly docs](https://context7.com/juspay/superposition) - [DeepWiki repository guide](https://deepwiki.com/juspay/superposition) +## Metrics & observability + +The HTTP API exposes Prometheus metrics on `SUPERPOSITION_METRICS_PORT` (default `9091`): + +```bash +curl http://localhost:9091/metrics +``` + ## Contributing We welcome contributions across the platform, clients, docs, and examples. 
diff --git a/crates/experimentation_platform/src/api/experiment_groups/handlers.rs b/crates/experimentation_platform/src/api/experiment_groups/handlers.rs index fd132adbf..6176bd38a 100644 --- a/crates/experimentation_platform/src/api/experiment_groups/handlers.rs +++ b/crates/experimentation_platform/src/api/experiment_groups/handlers.rs @@ -64,9 +64,9 @@ use crate::api::{ experiments::{ cac_api::validate_context, helpers::{ - validate_change_reason_with_function, hash, - validate_and_add_experiment_group_id, + hash, validate_and_add_experiment_group_id, validate_and_remove_experiment_group_id, + validate_change_reason_with_function, }, }, }; diff --git a/crates/experimentation_platform/src/api/experiments/handlers.rs b/crates/experimentation_platform/src/api/experiments/handlers.rs index ee62108e5..9c9aafb2b 100644 --- a/crates/experimentation_platform/src/api/experiments/handlers.rs +++ b/crates/experimentation_platform/src/api/experiments/handlers.rs @@ -89,9 +89,9 @@ use crate::api::{ }, experiments::{ helpers::{ - validate_change_reason_with_function, get_control_overrides_from_exp_id, put_experiments_in_redis, - validate_control_overrides, validate_delete_experiment_variants, + validate_change_reason_with_function, validate_control_overrides, + validate_delete_experiment_variants, }, types::StartedByChangeSet, }, @@ -170,7 +170,7 @@ async fn create_handler( &workspace_context, &change_reason, &state, - &user + &user, ) .await?; @@ -462,7 +462,7 @@ async fn conclude_handler( &workspace_context, &req.change_reason, &state, - &user + &user, ) .await?; @@ -745,7 +745,7 @@ async fn discard_handler( &workspace_context, &req.change_reason, &state, - &user + &user, ) .await?; @@ -1359,7 +1359,7 @@ async fn ramp_handler( &workspace_context, &change_reason, &state, - &user + &user, ) .await?; @@ -1568,7 +1568,7 @@ async fn update_handler( &workspace_context, &change_reason, &state, - &user + &user, ) .await?; @@ -1907,7 +1907,7 @@ async fn pause_handler( 
&workspace_context, &req.change_reason, &state, - &user + &user, ) .await?; @@ -2004,7 +2004,7 @@ async fn resume_handler( &workspace_context, &req.change_reason, &state, - &user + &user, ) .await?; diff --git a/crates/experimentation_platform/src/api/experiments/helpers.rs b/crates/experimentation_platform/src/api/experiments/helpers.rs index e964c2063..08e1537dc 100644 --- a/crates/experimentation_platform/src/api/experiments/helpers.rs +++ b/crates/experimentation_platform/src/api/experiments/helpers.rs @@ -790,7 +790,8 @@ pub async fn validate_change_reason_with_function( change_reason: change_reason.clone(), }; - let headers_map = construct_header_map(workspace_context, vec![("x-user", user_str)])?; + let headers_map = + construct_header_map(workspace_context, vec![("x-user", user_str)])?; let response = http_client .post(&url) diff --git a/crates/service_utils/Cargo.toml b/crates/service_utils/Cargo.toml index ad00d2a55..6293e4065 100644 --- a/crates/service_utils/Cargo.toml +++ b/crates/service_utils/Cargo.toml @@ -21,10 +21,16 @@ diesel = { workspace = true } diesel-adapter = { version = "1.2.0" } fred = { workspace = true, features = ["metrics"] } futures-util = { workspace = true } +humantime = { workspace = true } inventory = { workspace = true } log = { workspace = true } once_cell = { workspace = true } openidconnect = "3.5.0" +opentelemetry = { workspace = true } +opentelemetry_sdk = { workspace = true } +opentelemetry-otlp = { workspace = true } +opentelemetry-prometheus = { workspace = true } +prometheus = { workspace = true } rand = "0.8" tokio = { workspace = true } tracing = { workspace = true } @@ -42,6 +48,7 @@ superposition_types = { workspace = true, features = [ "api", "diesel_derives", ] } +thiserror = { workspace = true } url = { workspace = true } urlencoding = "~2.1.2" uuid = {workspace = true} diff --git a/crates/service_utils/src/lib.rs b/crates/service_utils/src/lib.rs index d8cf11600..686934474 100644 --- 
a/crates/service_utils/src/lib.rs +++ b/crates/service_utils/src/lib.rs @@ -1,10 +1,15 @@ #![deny(unused_crate_dependencies)] +// opentelemetry_otlp is only used in cfg(not(test)) code; suppress the lint +// when compiling tests. +#[cfg(test)] +use opentelemetry_otlp as _; pub mod aws; pub mod db; pub mod encryption; pub mod extensions; pub mod helpers; pub mod middlewares; +pub mod observability; pub mod redis; pub mod registry; pub mod service; diff --git a/crates/service_utils/src/observability.rs b/crates/service_utils/src/observability.rs new file mode 100644 index 000000000..3f2d72590 --- /dev/null +++ b/crates/service_utils/src/observability.rs @@ -0,0 +1,244 @@ +//! HTTP golden-signals metrics exposition via OpenTelemetry. +//! +//! See `docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md`. + +mod config; +mod meters; +mod metrics_server; +mod middleware; +mod saturation; + +pub use config::{LabelConfig, ObservabilityConfig}; +pub use meters::HttpMeters; +pub use metrics_server::spawn_metrics_server; +pub use middleware::MetricsMiddleware; +pub use saturation::{ + DbPoolHandle, FredPoolStats, RedisHandle, RedisStats, SaturationDeps, + register_observers, +}; + +use std::sync::Arc; + +use opentelemetry::metrics::Meter; +use opentelemetry_sdk::metrics::SdkMeterProvider; +use prometheus::Registry; +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum ObservabilityError { + #[error("prometheus exporter init failed: {0}")] + PrometheusInit(String), + #[error("otlp exporter init failed: {0}")] + OtlpInit(String), + #[error("config error: {0}")] + Config(String), + #[error("meter provider shutdown failed: {0}")] + Shutdown(String), + #[error(transparent)] + Io(#[from] std::io::Error), +} + +pub struct Observability { + provider: SdkMeterProvider, + registry: Arc, + meter: Meter, +} + +impl Observability { + pub fn meter(&self) -> Meter { + self.meter.clone() + } + + pub fn registry(&self) -> Arc { + self.registry.clone() + } + + pub fn 
shutdown(self) -> Result<(), ObservabilityError> { + self.provider + .shutdown() + .map_err(|e| ObservabilityError::Shutdown(e.to_string())) + } + + pub fn init(cfg: ObservabilityConfig) -> Result { + use opentelemetry::KeyValue; + use opentelemetry_sdk::Resource; + use opentelemetry_sdk::metrics::SdkMeterProvider; + + let registry = Arc::new(prometheus::Registry::new()); + + let exporter = opentelemetry_prometheus::exporter() + .with_registry((*registry).clone()) + .without_scope_info() + .build() + .map_err(|e| ObservabilityError::PrometheusInit(e.to_string()))?; + + let mut resource_attrs = vec![ + KeyValue::new("service.name", cfg.service_name.clone()), + KeyValue::new("service.version", cfg.service_version.clone()), + KeyValue::new("service.instance.id", cfg.instance_id.clone()), + ]; + if let Some(env) = &cfg.deployment_environment { + resource_attrs.push(KeyValue::new("deployment.environment", env.clone())); + } + + // §8.5 — merge OTEL_RESOURCE_ATTRIBUTES ("k1=v1,k2=v2,...") if set. + // Keys and values are percent-encoded per W3C baggage / OTel spec; decode before use. 
+ if let Ok(extra) = std::env::var("OTEL_RESOURCE_ATTRIBUTES") { + for pair in extra.split(',') { + if let Some((k, v)) = pair.split_once('=') { + let k = urlencoding::decode(k.trim()) + .unwrap_or_else(|_| k.trim().into()) + .into_owned(); + let v = urlencoding::decode(v.trim()) + .unwrap_or_else(|_| v.trim().into()) + .into_owned(); + if !k.is_empty() { + resource_attrs.push(KeyValue::new(k, v)); + } + } + } + } + + let resource = Resource::new(resource_attrs); + + let mut builder = SdkMeterProvider::builder() + .with_reader(exporter) + .with_resource(resource.clone()); + + if let Some(endpoint) = &cfg.otlp_endpoint { + match with_otlp_reader(builder, endpoint, cfg.collect_interval) { + Ok(b) => builder = b, + Err(e) => { + tracing::warn!( + error = %e, + endpoint = %endpoint, + "OTLP exporter init failed; metrics will be exposed via /metrics only", + ); + // Rebuild Prom-only builder (base was consumed by with_otlp_reader). + let prom_exporter = opentelemetry_prometheus::exporter() + .with_registry((*registry).clone()) + .without_scope_info() + .build() + .map_err(|e| ObservabilityError::PrometheusInit(e.to_string()))?; + builder = SdkMeterProvider::builder() + .with_reader(prom_exporter) + .with_resource(resource); + } + } + } + + let provider = builder.build(); + opentelemetry::global::set_meter_provider(provider.clone()); + let meter = { + use opentelemetry::metrics::MeterProvider as _; + provider.meter("superposition") + }; + + Ok(Self { + provider, + registry, + meter, + }) + } +} + +#[cfg(not(test))] +fn with_otlp_reader( + builder: opentelemetry_sdk::metrics::MeterProviderBuilder, + endpoint: &str, + interval: std::time::Duration, +) -> Result { + // Warn if the operator requested a protocol we do not support (gRPC). + // This binary is compiled with `http-proto` only; `grpc` silently falls + // back to HTTP, which can mask misconfiguration. 
+ if let Ok(protocol) = std::env::var("OTEL_EXPORTER_OTLP_PROTOCOL") { + if !protocol.is_empty() && protocol != "http/protobuf" { + tracing::warn!( + requested_protocol = %protocol, + "OTEL_EXPORTER_OTLP_PROTOCOL set to '{}'; only 'http/protobuf' is supported in v1, using HTTP", + protocol + ); + } + } + + // Headers: the opentelemetry-otlp 0.27 HTTP exporter reads + // `OTEL_EXPORTER_OTLP_HEADERS` (and `OTEL_EXPORTER_OTLP_METRICS_HEADERS`) + // automatically during `build()` — no explicit wiring needed here. + use opentelemetry_otlp::{MetricExporter, WithExportConfig}; + use opentelemetry_sdk::metrics::PeriodicReader; + use opentelemetry_sdk::runtime; + + let exporter = MetricExporter::builder() + .with_http() + .with_endpoint(endpoint.to_owned()) + .build() + .map_err(|e| ObservabilityError::OtlpInit(e.to_string()))?; + + let reader = PeriodicReader::builder(exporter, runtime::Tokio) + .with_interval(interval) + .build(); + + Ok(builder.with_reader(reader)) +} + +#[cfg(test)] +fn with_otlp_reader( + builder: opentelemetry_sdk::metrics::MeterProviderBuilder, + _endpoint: &str, + _interval: std::time::Duration, +) -> Result { + // OTLP exporter requires a tokio runtime; we don't spin one up in unit tests. 
+ Ok(builder) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn test_cfg() -> ObservabilityConfig { + ObservabilityConfig { + enabled: true, + bind: "127.0.0.1".parse().unwrap(), + port: 0, + label: LabelConfig::default(), + collect_interval: std::time::Duration::from_secs(10), + instance_id: "test".into(), + service_name: "sp-test".into(), + service_version: "0.0.0-test".into(), + deployment_environment: None, + otlp_endpoint: None, + } + } + + #[test] + fn init_builds_meter_and_registry() { + let obs = Observability::init(test_cfg()).expect("init failed"); + let _meter = obs.meter(); + let registry = obs.registry(); + let families = registry.gather(); + assert_eq!( + families.len(), + 1, + "only target_info should be present before any instrument records" + ); + assert_eq!(families[0].get_name(), "target_info"); + } + + #[test] + fn meter_can_record_a_histogram_and_register_it_in_registry() { + let obs = Observability::init(test_cfg()).unwrap(); + let meter = obs.meter(); + let h = meter.f64_histogram("test.duration").with_unit("s").build(); + h.record(0.123, &[]); + + let mut buf = Vec::new(); + let encoder = prometheus::TextEncoder::new(); + let metric_families = obs.registry().gather(); + prometheus::Encoder::encode(&encoder, &metric_families, &mut buf).unwrap(); + let text = String::from_utf8(buf).unwrap(); + assert!( + text.contains("test_duration"), + "expected test_duration in exposition, got:\n{text}" + ); + } +} diff --git a/crates/service_utils/src/observability/config.rs b/crates/service_utils/src/observability/config.rs new file mode 100644 index 000000000..79bccfb40 --- /dev/null +++ b/crates/service_utils/src/observability/config.rs @@ -0,0 +1,194 @@ +//! Configuration for the observability subsystem, parsed from env vars. 
+ +use std::{net::IpAddr, str::FromStr, time::Duration}; + +#[derive(Debug, Clone)] +pub struct ObservabilityConfig { + pub enabled: bool, + pub bind: IpAddr, + pub port: u16, + pub label: LabelConfig, + pub collect_interval: Duration, + pub instance_id: String, + pub service_name: String, + pub service_version: String, + pub deployment_environment: Option, + pub otlp_endpoint: Option, +} + +#[derive(Debug, Clone, Copy)] +pub struct LabelConfig { + pub with_org_label: bool, + pub with_workspace_label: bool, +} + +impl Default for LabelConfig { + fn default() -> Self { + Self { + with_org_label: true, + with_workspace_label: true, + } + } +} + +/// Source of env-var values: `(key) -> Some(value) | None`. +/// +/// Marker trait with a blanket impl over every closure / fn-pointer that +/// matches the underlying `Fn` signature. Lets `from_source` and its +/// helpers share one named bound instead of repeating +/// `Fn(&str) -> Option` at every site. Crate-private — the +/// public config-loading API is `from_env`. +pub(crate) trait EnvSource: Fn(&str) -> Option {} +impl Option> EnvSource for F {} + +impl ObservabilityConfig { + /// Parse from the process environment via `std::env::var`. + pub fn from_env() -> Result { + Self::from_source(|k| std::env::var(k).ok()) + } + + /// Generic over the env source for testability. + /// + /// `get(key)` returns `Some(value)` when the key is set, `None` when absent. + /// This keeps tests pure (no process-global env mutations) and parallel-safe. + pub(crate) fn from_source(get: F) -> Result { + /// Parse `key` via `T: FromStr`, falling back to `default_str` when + /// the key is absent. Folds the lookup, the default, and the error + /// label into one place so each env key appears exactly once at the + /// call site. 
+ fn parse_or_default( + get: &impl EnvSource, + key: &'static str, + default_str: &str, + ) -> Result + where + T: FromStr, + T::Err: std::fmt::Display, + { + let raw = get(key).unwrap_or_else(|| default_str.to_owned()); + T::from_str(&raw).map_err(|e| format!("{key}: {e}")) + } + + fn get_opt(get: &impl EnvSource, key: &str) -> Option { + get(key).filter(|s| !s.is_empty()) + } + + let enabled: bool = + parse_or_default(&get, "SUPERPOSITION_METRICS_ENABLED", "true")?; + let bind: IpAddr = + parse_or_default(&get, "SUPERPOSITION_METRICS_BIND", "0.0.0.0")?; + let port: u16 = parse_or_default(&get, "SUPERPOSITION_METRICS_PORT", "9091")?; + let with_org_label: bool = + parse_or_default(&get, "SUPERPOSITION_METRICS_LABEL_ORG", "true")?; + let with_workspace_label: bool = + parse_or_default(&get, "SUPERPOSITION_METRICS_LABEL_WORKSPACE", "true")?; + // humantime::Duration is a newtype around std::time::Duration that + // implements FromStr ("10s", "1m30s", "500ms"); convert back to the + // struct's std::time::Duration field after parsing. + let collect_interval: humantime::Duration = + parse_or_default(&get, "SUPERPOSITION_METRICS_COLLECT_INTERVAL", "10s")?; + let collect_interval: Duration = collect_interval.into(); + + // instance_id: env var takes precedence, then /etc/hostname, then "unknown". + let instance_id = get_opt(&get, "SUPERPOSITION_INSTANCE_ID") + .or_else(hostname_or_none) + .unwrap_or_else(|| "unknown".to_owned()); + + // service.name: OTEL standard env var. String: FromStr + // means the `?` is a noop, but the call shape stays consistent with the + // parsed fields above. + let service_name: String = + parse_or_default(&get, "OTEL_SERVICE_NAME", "superposition")?; + + // service.version: always the build-time crate version. 
+ let service_version = env!("CARGO_PKG_VERSION").to_owned(); + + let deployment_environment = + get_opt(&get, "APP_ENV").or_else(|| get_opt(&get, "DEPLOYMENT_ENV")); + + let otlp_endpoint = get_opt(&get, "OTEL_EXPORTER_OTLP_ENDPOINT"); + + Ok(Self { + enabled, + bind, + port, + label: LabelConfig { + with_org_label, + with_workspace_label, + }, + collect_interval, + instance_id, + service_name, + service_version, + deployment_environment, + otlp_endpoint, + }) + } +} + +fn hostname_or_none() -> Option { + // Avoid pulling in a hostname crate; read /etc/hostname on Linux/macOS. + std::fs::read_to_string("/etc/hostname") + .ok() + .map(|s| s.trim().to_owned()) + .filter(|s| !s.is_empty()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + + fn lookup(map: HashMap<&str, &str>) -> impl Fn(&str) -> Option { + let owned: HashMap = map + .into_iter() + .map(|(k, v)| (k.to_owned(), v.to_owned())) + .collect(); + move |k| owned.get(k).cloned() + } + + #[test] + fn defaults_when_unset() { + let cfg = ObservabilityConfig::from_source(|_| None).unwrap(); + assert!(cfg.enabled); + assert_eq!(cfg.port, 9091); + assert_eq!(cfg.bind.to_string(), "0.0.0.0"); + assert!(cfg.label.with_org_label); + assert!(cfg.label.with_workspace_label); + assert_eq!(cfg.collect_interval, Duration::from_secs(10)); + assert_eq!(cfg.service_name, "superposition"); + assert_eq!(cfg.otlp_endpoint, None); + } + + #[test] + fn explicit_overrides() { + let cfg = ObservabilityConfig::from_source(lookup(HashMap::from([ + ("SUPERPOSITION_METRICS_ENABLED", "false"), + ("SUPERPOSITION_METRICS_PORT", "9999"), + ("SUPERPOSITION_METRICS_BIND", "127.0.0.1"), + ("SUPERPOSITION_METRICS_LABEL_WORKSPACE", "false"), + ("SUPERPOSITION_METRICS_COLLECT_INTERVAL", "30s"), + ("OTEL_EXPORTER_OTLP_ENDPOINT", "http://collector:4318"), + ("OTEL_SERVICE_NAME", "sp-test"), + ]))) + .unwrap(); + assert!(!cfg.enabled); + assert_eq!(cfg.port, 9999); + assert_eq!(cfg.bind.to_string(), "127.0.0.1"); + 
assert!(cfg.label.with_org_label); // default still true + assert!(!cfg.label.with_workspace_label); + assert_eq!(cfg.collect_interval, Duration::from_secs(30)); + assert_eq!(cfg.otlp_endpoint.as_deref(), Some("http://collector:4318")); + assert_eq!(cfg.service_name, "sp-test"); + } + + #[test] + fn malformed_port_errors() { + let err = ObservabilityConfig::from_source(lookup(HashMap::from([( + "SUPERPOSITION_METRICS_PORT", + "not-a-number", + )]))) + .unwrap_err(); + assert!(err.contains("SUPERPOSITION_METRICS_PORT")); + } +} diff --git a/crates/service_utils/src/observability/meters.rs b/crates/service_utils/src/observability/meters.rs new file mode 100644 index 000000000..e5a8713bf --- /dev/null +++ b/crates/service_utils/src/observability/meters.rs @@ -0,0 +1,44 @@ +//! Typed handles for the metric instruments emitted by the HTTP middleware. + +use opentelemetry::metrics::{Counter, Histogram, Meter, UpDownCounter}; + +/// Histogram + counter + gauge for HTTP server golden signals. Built once at +/// startup and cloned cheaply; instruments are `Arc<>`-backed internally. 
+#[derive(Clone)] +pub struct HttpMeters { + pub request_duration: Histogram, + pub busy_duration: Counter, + pub active_requests: UpDownCounter, +} + +impl HttpMeters { + pub fn new(meter: &Meter) -> Self { + let request_duration = meter + .f64_histogram("http.server.request.duration") + .with_unit("s") + .with_description("Duration of HTTP server requests, in seconds.") + .with_boundaries(vec![0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]) + .build(); + + let busy_duration = meter + .f64_counter("http.server.busy.duration") + .with_unit("s") + .with_description( + "Cumulative seconds spent serving HTTP requests; \ + rate() over a window gives time-averaged request concurrency.", + ) + .build(); + + let active_requests = meter + .i64_up_down_counter("http.server.active_requests") + .with_unit("{request}") + .with_description("Number of HTTP server requests currently in flight.") + .build(); + + Self { + request_duration, + busy_duration, + active_requests, + } + } +} diff --git a/crates/service_utils/src/observability/metrics_server.rs b/crates/service_utils/src/observability/metrics_server.rs new file mode 100644 index 000000000..5ad3c3c84 --- /dev/null +++ b/crates/service_utils/src/observability/metrics_server.rs @@ -0,0 +1,63 @@ +//! Separate HttpServer that exposes /metrics on SUPERPOSITION_METRICS_PORT. + +use std::{net::SocketAddr, sync::Arc}; + +use actix_web::{App, HttpResponse, HttpServer, dev::Server, web}; +use prometheus::{Encoder, Registry, TextEncoder}; + +/// Spawn an HttpServer on `bind` whose only route is `GET /metrics`. Returns +/// the actix `Server` handle so the caller can `await` it concurrently with +/// the main app. +pub fn spawn_metrics_server( + registry: Arc, + bind: SocketAddr, +) -> std::io::Result { + let registry_data = web::Data::new(registry); + Ok(HttpServer::new(move || { + App::new() + .app_data(registry_data.clone()) + .route("/metrics", web::get().to(scrape)) + }) + .workers(1) + .bind(bind)? 
+ .run()) +} + +async fn scrape(registry: web::Data>) -> HttpResponse { + let encoder = TextEncoder::new(); + let metric_families = registry.gather(); + let mut buf = Vec::new(); + if let Err(e) = encoder.encode(&metric_families, &mut buf) { + return HttpResponse::InternalServerError().body(format!("encode error: {e}")); + } + HttpResponse::Ok() + .content_type(encoder.format_type()) + .body(buf) +} + +#[cfg(test)] +mod tests { + use super::*; + use actix_web::{App, http::StatusCode, test}; + + #[actix_web::test] + async fn scrape_endpoint_returns_text_plain() { + let registry = Arc::new(Registry::new()); + let app = test::init_service( + App::new() + .app_data(web::Data::new(registry.clone())) + .route("/metrics", web::get().to(scrape)), + ) + .await; + let req = test::TestRequest::get().uri("/metrics").to_request(); + let resp = test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::OK); + let ct = resp + .headers() + .get("content-type") + .unwrap() + .to_str() + .unwrap(); + assert!(ct.starts_with("text/plain"), "got {ct}"); + } +} diff --git a/crates/service_utils/src/observability/middleware.rs b/crates/service_utils/src/observability/middleware.rs new file mode 100644 index 000000000..11b76e44f --- /dev/null +++ b/crates/service_utils/src/observability/middleware.rs @@ -0,0 +1,457 @@ +//! Actix middleware that records OpenTelemetry HTTP server metrics. 
+ +use std::future::{Ready, ready}; +use std::rc::Rc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Instant; + +use actix_web::{ + Error, HttpMessage, + body::MessageBody, + dev::{Service, ServiceRequest, ServiceResponse, Transform, forward_ready}, +}; +use futures_util::future::LocalBoxFuture; +use opentelemetry::KeyValue; +use opentelemetry::metrics::{Meter, UpDownCounter}; + +use crate::observability::config::LabelConfig; +use crate::observability::meters::HttpMeters; +use crate::service::types::{OrganisationId, WorkspaceId}; + +/// Per OpenTelemetry HTTP semantic conventions, only known methods get their +/// literal name; anything else collapses to `_OTHER`. Prevents weirdo clients +/// from blowing up the cardinality of the `http.request.method` attribute. +pub(crate) fn normalize_method(m: &actix_web::http::Method) -> &'static str { + macro_rules! match_known { + ($val:expr, [$($name:literal),+ $(,)?], $other:literal) => { + match $val { $($name => $name,)+ _ => $other } + }; + } + match_known!( + m.as_str(), + ["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "TRACE", "CONNECT"], + "_OTHER" + ) +} + +/// Sentinel for paths that did not match any registered route (would 404). +pub(crate) const ROUTE_NOT_FOUND: &str = "__not_found__"; + +/// Sentinel for static-asset routes (pkg, assets, favicon). Collapsing these +/// prevents one cardinality series per unique path tail. +pub(crate) const ROUTE_STATIC: &str = "__static__"; + +/// Route-pattern prefixes that identify static-asset serving routes. +/// Any `match_pattern()` that starts with one of these is collapsed to +/// `ROUTE_STATIC` to keep `http.route` cardinality bounded. +const STATIC_PATTERN_PREFIXES: &[&str] = &["/pkg", "/assets", "/favicon"]; + +/// Returns `true` when `pattern` belongs to a static-asset route. 
+pub(crate) fn is_static_pattern(pattern: &str) -> bool { + STATIC_PATTERN_PREFIXES + .iter() + .any(|prefix| pattern.starts_with(prefix)) +} + +/// Extracts the templated route pattern from a `ServiceRequest`. +/// Falls back to `ROUTE_NOT_FOUND` when no route matched; +/// collapses static patterns to `ROUTE_STATIC`. +/// +/// This is the request-phase variant. In production the middleware uses +/// [`extract_route_from_response`] (response phase) because route matching is +/// only complete after the inner service has run. This function is available +/// for callers that have a live `ServiceRequest` (e.g. future request-scoped +/// middleware). +#[allow(dead_code)] +pub(crate) fn extract_route(req: &ServiceRequest) -> String { + match req.match_pattern() { + None => ROUTE_NOT_FOUND.to_owned(), + Some(p) if is_static_pattern(&p) => ROUTE_STATIC.to_owned(), + Some(p) => p, + } +} + +/// Same logic as `extract_route` but operates on a completed `ServiceResponse` +/// (available in the middleware's response phase). +pub(crate) fn extract_route_from_response(res: &ServiceResponse) -> String { + match res.request().match_pattern() { + None => ROUTE_NOT_FOUND.to_owned(), + Some(p) if is_static_pattern(&p) => ROUTE_STATIC.to_owned(), + Some(p) => p, + } +} + +/// Build the OTel attributes set for a single HTTP request. Reads org_id / +/// workspace_id from request extensions if `OrgWorkspaceMiddlewareFactory` +/// has populated them; otherwise omits those attributes entirely (rather +/// than emitting an empty string, which would create a distinct series). 
+pub(crate) fn build_attributes( + method: &'static str, + route: &str, + status_code: u16, + org_id: Option<&str>, + workspace: Option<&str>, + label_cfg: &LabelConfig, +) -> Vec { + let mut attrs = Vec::with_capacity(5); + attrs.push(KeyValue::new("http.request.method", method)); + attrs.push(KeyValue::new("http.route", route.to_owned())); + attrs.push(KeyValue::new( + "http.response.status_code", + status_code as i64, + )); + if label_cfg.with_org_label { + if let Some(o) = org_id { + attrs.push(KeyValue::new("sp.org_id", o.to_owned())); + } + } + if label_cfg.with_workspace_label { + if let Some(w) = workspace { + attrs.push(KeyValue::new("sp.workspace_id", w.to_owned())); + } + } + attrs +} + +/// RAII guard that decrements `http.server.active_requests` on Drop unless +/// `release()` was called. Ensures a panicking handler still decrements the +/// gauge. +#[must_use = "dropping InFlightGuard immediately negates the in-flight window"] +pub(crate) struct InFlightGuard { + counter: UpDownCounter, + method: &'static str, + decremented: AtomicBool, +} + +impl InFlightGuard { + pub(crate) fn enter(counter: UpDownCounter, method: &'static str) -> Self { + counter.add(1, &[KeyValue::new("http.request.method", method)]); + Self { + counter, + method, + decremented: AtomicBool::new(false), + } + } + + pub(crate) fn release(&self) { + if !self.decremented.swap(true, Ordering::Relaxed) { + self.counter + .add(-1, &[KeyValue::new("http.request.method", self.method)]); + } + } +} + +impl Drop for InFlightGuard { + fn drop(&mut self) { + self.release(); + } +} + +#[derive(Clone)] +pub struct MetricsMiddleware { + meters: HttpMeters, + label_cfg: LabelConfig, +} + +impl MetricsMiddleware { + pub fn new(meter: &Meter, label_cfg: LabelConfig) -> Self { + Self { + meters: HttpMeters::new(meter), + label_cfg, + } + } +} + +impl Transform for MetricsMiddleware +where + S: Service, Error = Error> + 'static, + S::Future: 'static, + B: MessageBody + 'static, +{ + type Response = 
ServiceResponse; + type Error = Error; + type InitError = (); + type Transform = MetricsMiddlewareImpl; + type Future = Ready>; + + fn new_transform(&self, service: S) -> Self::Future { + ready(Ok(MetricsMiddlewareImpl { + service: Rc::new(service), + meters: self.meters.clone(), + label_cfg: self.label_cfg, + })) + } +} + +pub struct MetricsMiddlewareImpl { + service: Rc, + meters: HttpMeters, + label_cfg: LabelConfig, +} + +impl Service for MetricsMiddlewareImpl +where + S: Service, Error = Error> + 'static, + S::Future: 'static, + B: MessageBody + 'static, +{ + type Response = ServiceResponse; + type Error = Error; + type Future = LocalBoxFuture<'static, Result>; + + forward_ready!(service); + + fn call(&self, req: ServiceRequest) -> Self::Future { + let service = self.service.clone(); + let meters = self.meters.clone(); + let label_cfg = self.label_cfg; + + let method_normalized = normalize_method(req.method()); + let start = Instant::now(); + let guard = + InFlightGuard::enter(meters.active_requests.clone(), method_normalized); + + Box::pin(async move { + let result = service.call(req).await; + let elapsed = start.elapsed().as_secs_f64(); + + match &result { + Ok(res) => { + let route = extract_route_from_response(res); + let status = res.status().as_u16(); + let extensions = res.request().extensions(); + let org = extensions.get::().map(|o| o.0.clone()); + let ws = extensions.get::().map(|w| w.0.clone()); + drop(extensions); + + let attrs = build_attributes( + method_normalized, + &route, + status, + org.as_deref(), + ws.as_deref(), + &label_cfg, + ); + meters.request_duration.record(elapsed, &attrs); + meters.busy_duration.add( + elapsed, + &[KeyValue::new("http.request.method", method_normalized)], + ); + } + Err(err) => { + // The request was consumed by `service.call`, so + // `match_pattern` is no longer accessible. Route stays + // `ROUTE_NOT_FOUND`. We do extract the real HTTP status + // from the error's response rather than blindly using 500. 
+ let status = err.error_response().status().as_u16(); + let attrs = build_attributes( + method_normalized, + ROUTE_NOT_FOUND, + status, + None, + None, + &label_cfg, + ); + meters.request_duration.record(elapsed, &attrs); + // The request still consumed worker time; count it. + meters.busy_duration.add( + elapsed, + &[KeyValue::new("http.request.method", method_normalized)], + ); + } + } + + guard.release(); + result + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use actix_web::http::Method; + + #[test] + fn known_methods_pass_through() { + for (m, expected) in [ + (Method::GET, "GET"), + (Method::POST, "POST"), + (Method::PUT, "PUT"), + (Method::DELETE, "DELETE"), + (Method::PATCH, "PATCH"), + (Method::HEAD, "HEAD"), + (Method::OPTIONS, "OPTIONS"), + (Method::TRACE, "TRACE"), + (Method::CONNECT, "CONNECT"), + ] { + assert_eq!(normalize_method(&m), expected); + } + } + + #[test] + fn unknown_methods_collapse_to_other() { + let m = Method::from_bytes(b"XPROPFIND").unwrap(); + assert_eq!(normalize_method(&m), "_OTHER"); + let m = Method::from_bytes(b"WEIRDO").unwrap(); + assert_eq!(normalize_method(&m), "_OTHER"); + } + + use actix_web::{App, HttpResponse, http::StatusCode, test as actix_test, web}; + + #[test] + fn extract_route_helper_handles_static_paths() { + assert!(is_static_pattern("/pkg/{tail:.*}")); + assert!(is_static_pattern("/assets/{tail:.*}")); + assert!(is_static_pattern("/favicon.ico")); + assert!(!is_static_pattern("/contexts/{id}")); + assert!(!is_static_pattern("/health")); + } + + #[actix_web::test] + async fn matched_route_setup_smoke() { + let app = actix_test::init_service(App::new().route( + "/contexts/{id}", + web::get().to(|| async { HttpResponse::Ok().finish() }), + )) + .await; + let req = actix_test::TestRequest::get() + .uri("/contexts/abc123") + .to_request(); + let resp = actix_test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::OK); + // Note: extract_route is exercised in the integration test in 
Task 19 + // because match_pattern() is only populated mid-pipeline. This unit-test + // stub is kept for build-coverage of the call site. + } + + use crate::observability::config::LabelConfig; + + #[test] + fn build_attributes_with_all_labels() { + let cfg = LabelConfig { + with_org_label: true, + with_workspace_label: true, + }; + let attrs = build_attributes( + "GET", + "/contexts/{id}", + 200, + Some("org1"), + Some("ws1"), + &cfg, + ); + assert_eq!(attrs.len(), 5); + assert!(attrs.iter().any(|kv| kv.key.as_str() == "sp.org_id")); + assert!(attrs.iter().any(|kv| kv.key.as_str() == "sp.workspace_id")); + } + + #[test] + fn build_attributes_omits_missing_workspace() { + let cfg = LabelConfig { + with_org_label: true, + with_workspace_label: true, + }; + let attrs = build_attributes("POST", "/orgs", 201, Some("org1"), None, &cfg); + assert_eq!(attrs.len(), 4); + assert!(attrs.iter().any(|kv| kv.key.as_str() == "sp.org_id")); + assert!(!attrs.iter().any(|kv| kv.key.as_str() == "sp.workspace_id")); + } + + #[test] + fn build_attributes_respects_disable_flag() { + let cfg = LabelConfig { + with_org_label: false, + with_workspace_label: false, + }; + let attrs = build_attributes("GET", "/x", 200, Some("org1"), Some("ws1"), &cfg); + assert_eq!(attrs.len(), 3); + assert!(!attrs.iter().any(|kv| kv.key.as_str() == "sp.org_id")); + assert!(!attrs.iter().any(|kv| kv.key.as_str() == "sp.workspace_id")); + } + + #[test] + fn guard_decrements_on_drop_only_once() { + use crate::observability::{LabelConfig, Observability, ObservabilityConfig}; + use std::time::Duration; + + let cfg = ObservabilityConfig { + enabled: true, + bind: "127.0.0.1".parse().unwrap(), + port: 0, + label: LabelConfig::default(), + collect_interval: Duration::from_secs(10), + instance_id: "test".into(), + service_name: "sp-test".into(), + service_version: "0".into(), + deployment_environment: None, + otlp_endpoint: None, + }; + let obs = Observability::init(cfg).unwrap(); + let m = 
obs.meter().i64_up_down_counter("test.in_flight").build(); + + { + let g = InFlightGuard::enter(m.clone(), "GET"); + g.release(); + // Drop after explicit release; should be a no-op. + } + // The guard should tolerate multiple release() calls without panicking + // and a release-then-drop pattern. + let g = InFlightGuard::enter(m.clone(), "POST"); + g.release(); + g.release(); + drop(g); + } + + #[actix_web::test] + async fn middleware_records_request_duration() { + use crate::observability::{Observability, ObservabilityConfig}; + use std::time::Duration; + + let cfg = ObservabilityConfig { + enabled: true, + bind: "127.0.0.1".parse().unwrap(), + port: 0, + label: LabelConfig::default(), + collect_interval: Duration::from_secs(10), + instance_id: "test".into(), + service_name: "sp-test".into(), + service_version: "0".into(), + deployment_environment: None, + otlp_endpoint: None, + }; + let obs = Observability::init(cfg).unwrap(); + let mw = MetricsMiddleware::new(&obs.meter(), LabelConfig::default()); + + use actix_web::{App, HttpResponse, http::StatusCode, web}; + let app = actix_test::init_service(App::new().wrap(mw).route( + "/ping", + web::get().to(|| async { HttpResponse::Ok().body("pong") }), + )) + .await; + + let req = actix_test::TestRequest::get().uri("/ping").to_request(); + let resp = actix_test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::OK); + + let mut buf = Vec::new(); + let metric_families = obs.registry().gather(); + prometheus::Encoder::encode( + &prometheus::TextEncoder::new(), + &metric_families, + &mut buf, + ) + .unwrap(); + let text = String::from_utf8(buf).unwrap(); + assert!( + text.contains("http_server_request_duration_seconds_count"), + "{text}" + ); + assert!( + text.contains("http_server_busy_duration_seconds_total"), + "{text}" + ); + assert!(text.contains("http_server_active_requests"), "{text}"); + assert!(text.contains("http_route=\"/ping\""), "{text}"); + } +} diff --git 
a/crates/service_utils/src/observability/saturation.rs b/crates/service_utils/src/observability/saturation.rs
new file mode 100644
index 000000000..9255e6e31
--- /dev/null
+++ b/crates/service_utils/src/observability/saturation.rs
@@ -0,0 +1,41 @@
+//! Saturation collectors: DB pool, Redis pool, Tokio runtime.
+//!
+//! All metrics are observable-instrument callbacks — no background tasks.
+
+mod db_pool;
+mod redis_pool;
+mod tokio_runtime;
+
+use opentelemetry::metrics::Meter;
+
+pub use db_pool::DbPoolHandle;
+pub use redis_pool::{FredPoolStats, RedisHandle, RedisStats};
+
+/// Optional dependencies the saturation subsystem can observe.
+#[derive(Default, Clone)]
+pub struct SaturationDeps {
+    /// DB connection pool to observe; `None` skips the DB pool gauges.
+    pub db_pool: Option<DbPoolHandle>,
+    /// Redis stats source to observe; `None` skips the Redis gauges.
+    pub redis_client: Option<RedisHandle>,
+}
+
+/// Registers the saturation observers on `meter`.
+///
+/// The DB pool and Redis observers are registered only when the matching
+/// field of `deps` is `Some`, both under the pool name `"primary"`. The Tokio
+/// runtime observers are always requested. Currently always returns `Ok(())`.
+pub fn register_observers(
+    meter: &Meter,
+    deps: SaturationDeps,
+) -> Result<(), super::ObservabilityError> {
+    if let Some(pool) = deps.db_pool {
+        db_pool::register(meter, pool, "primary");
+    }
+    if let Some(client) = deps.redis_client {
+        redis_pool::register(meter, client, "primary");
+    }
+    tokio_runtime::register(meter);
+
+    Ok(())
+}
diff --git a/crates/service_utils/src/observability/saturation/db_pool.rs b/crates/service_utils/src/observability/saturation/db_pool.rs
new file mode 100644
index 000000000..c3201b86b
--- /dev/null
+++ b/crates/service_utils/src/observability/saturation/db_pool.rs
@@ -0,0 +1,50 @@
+//! ObservableGauge callbacks for the r2d2 connection pool. Purely passive —
+//! no instrumentation at `pool.get()` call sites.
+
+use std::sync::Arc;
+
+use opentelemetry::{KeyValue, metrics::Meter};
+
+/// Concrete pool type used across the codebase.
+///
+/// Mirrors `crate::db::PgSchemaConnectionPool` (which aliases
+/// `diesel::r2d2::Pool>`).
+/// Using an explicit expansion here so the observability subsystem does not
+/// take a hard dep on `crate::db` — callers pass the handle in via
+/// `SaturationDeps`.
+pub type DbPoolHandle = + Arc>>; + +pub fn register(meter: &Meter, pool: DbPoolHandle, pool_name: &'static str) { + let pool_for_usage = pool.clone(); + let usage_pool_name = KeyValue::new("pool.name", pool_name); + meter + .u64_observable_gauge("db.client.connections.usage") + .with_description("Number of DB connections in idle/used state.") + .with_callback(move |observer| { + let s = pool_for_usage.state(); + let used = s.connections.saturating_sub(s.idle_connections); + observer.observe( + s.idle_connections as u64, + &[KeyValue::new("state", "idle"), usage_pool_name.clone()], + ); + observer.observe( + used as u64, + &[KeyValue::new("state", "used"), usage_pool_name.clone()], + ); + }) + .build(); + + let pool_for_max = pool.clone(); + let max_pool_name = KeyValue::new("pool.name", pool_name); + meter + .u64_observable_gauge("db.client.connections.max") + .with_description("Configured maximum size of the DB connection pool.") + .with_callback(move |observer| { + observer.observe( + pool_for_max.max_size() as u64, + std::slice::from_ref(&max_pool_name), + ); + }) + .build(); +} diff --git a/crates/service_utils/src/observability/saturation/redis_pool.rs b/crates/service_utils/src/observability/saturation/redis_pool.rs new file mode 100644 index 000000000..a0576e3b6 --- /dev/null +++ b/crates/service_utils/src/observability/saturation/redis_pool.rs @@ -0,0 +1,91 @@ +//! Saturation gauges for the Redis client pool (fred crate). +//! +//! fred's `metrics` feature exposes per-client / per-pool stats. The +//! callbacks below are intentionally tolerant: if a stat is unavailable +//! in the version we use, the metric is simply not emitted. +//! +//! ## fred 9.2.1 API notes +//! +//! * `ClientLike::is_connected()` returns true when the client's underlying +//! connection to Redis is active. Counting these across the pool gives a +//! useful "healthy connections" gauge. +//! * `MetricsInterface` is implemented on `RedisClient`, **not** on +//! `RedisPool`. 
The pool exposes `.clients() -> &[RedisClient]` so we can +//! iterate over individual clients. +//! * `command_queue_len()` (via `MetricsInterface`) counts buffered commands +//! waiting to be written to the socket. Summing across pool clients gives a +//! useful "pending work" gauge. + +use std::sync::Arc; + +use fred::{ + interfaces::{ClientLike, MetricsInterface}, + prelude::RedisPool, +}; +use opentelemetry::{KeyValue, metrics::Meter}; + +/// Wraps whatever fred client/pool type the rest of `service_utils` uses. +/// The wrapping type implements `RedisStats` so the observability module +/// does not have to know fred's concrete types. +pub type RedisHandle = Arc; + +/// Thin abstraction over the fred metrics surface so the saturation module +/// is decoupled from fred's exact API. +/// +/// Returning `None` from any getter simply omits the corresponding metric. +/// This is intentional so that not-yet-wired stats do not break the build. +pub trait RedisStats { + /// Number of clients in the pool with an active connection to Redis. + fn connected_connections(&self) -> Option; + fn commands_in_flight(&self) -> Option; +} + +/// Implements `RedisStats` for the project's fred `RedisPool`. +pub struct FredPoolStats(pub RedisPool); + +impl RedisStats for FredPoolStats { + fn connected_connections(&self) -> Option { + Some(self.0.clients().iter().filter(|c| c.is_connected()).count() as u64) + } + + fn commands_in_flight(&self) -> Option { + // `command_queue_len()` is available via `MetricsInterface` on each + // `RedisClient`. It counts commands buffered in the client that have + // not yet been written to the network socket. Summing across all + // pool clients gives an approximate "pending work" measure. 
+ let total: usize = self.0.clients().iter().map(|c| c.command_queue_len()).sum(); + Some(total as u64) + } +} + +pub fn register(meter: &Meter, client: RedisHandle, pool_name: &'static str) { + let pool_label = KeyValue::new("pool.name", pool_name); + + let c = client.clone(); + let label = pool_label.clone(); + meter + .u64_observable_gauge("redis.client.connections.connected") + .with_description( + "Number of Redis client connections currently connected to the server.", + ) + .with_callback(move |observer| { + if let Some(n) = c.connected_connections() { + observer.observe(n, std::slice::from_ref(&label)); + } + }) + .build(); + + let c = client.clone(); + let label = pool_label.clone(); + meter + .u64_observable_gauge("redis.client.commands.in_flight") + .with_description( + "Number of Redis commands currently buffered (waiting to be sent to the server).", + ) + .with_callback(move |observer| { + if let Some(n) = c.commands_in_flight() { + observer.observe(n, std::slice::from_ref(&label)); + } + }) + .build(); +} diff --git a/crates/service_utils/src/observability/saturation/tokio_runtime.rs b/crates/service_utils/src/observability/saturation/tokio_runtime.rs new file mode 100644 index 000000000..597c3b26c --- /dev/null +++ b/crates/service_utils/src/observability/saturation/tokio_runtime.rs @@ -0,0 +1,58 @@ +//! Tokio runtime saturation gauges. +//! +//! Reads `tokio::runtime::Handle::metrics()` directly from each observable +//! callback — no background sampler, no atomics snapshot, no `RuntimeMonitor`. +//! Worker count and global queue depth are stable instantaneous values; total +//! busy time is exposed as a monotonic Counter (per-worker durations summed +//! and reported in seconds), letting Prometheus compute the rate / saturation +//! ratio at query time. +//! +//! No-op when not running on a Tokio runtime. 
+
+use opentelemetry::metrics::Meter;
+
+/// Registers the Tokio runtime observers on `meter`; a no-op outside a Tokio runtime.
+pub fn register(meter: &Meter) {
+    let Ok(runtime) = tokio::runtime::Handle::try_current() else {
+        return;
+    };
+
+    let workers_rt = runtime.clone();
+    meter
+        .u64_observable_gauge("runtime.tokio.workers")
+        .with_description("Number of tokio worker threads.")
+        .with_callback(move |observer| {
+            let workers = workers_rt.metrics().num_workers();
+            observer.observe(workers as u64, &[]);
+        })
+        .build();
+
+    let queue_rt = runtime.clone();
+    meter
+        .u64_observable_gauge("runtime.tokio.global_queue.depth")
+        .with_description("Tasks queued in the runtime's global injection queue.")
+        .with_callback(move |observer| {
+            let depth = queue_rt.metrics().global_queue_depth();
+            observer.observe(depth as u64, &[]);
+        })
+        .build();
+
+    // tokio only provides `worker_total_busy_duration` on targets with 64-bit
+    // atomics, so this instrument carries the same cfg gate.
+    #[cfg(target_has_atomic = "64")]
+    {
+        let busy_rt = runtime;
+        meter
+            .f64_observable_counter("runtime.tokio.workers.busy.time")
+            .with_unit("s")
+            .with_description(
+                "Cumulative time tokio worker threads have spent busy, summed across workers.",
+            )
+            .with_callback(move |observer| {
+                let stats = busy_rt.metrics();
+                let busy = |w: usize| stats.worker_total_busy_duration(w).as_secs_f64();
+                observer.observe((0..stats.num_workers()).map(busy).sum::<f64>(), &[]);
+            })
+            .build();
+    }
+}
diff --git a/crates/service_utils/tests/observability_integration.rs b/crates/service_utils/tests/observability_integration.rs
new file mode 100644
index 000000000..be005390f
--- /dev/null
+++ b/crates/service_utils/tests/observability_integration.rs
@@ -0,0 +1,213 @@
+//! End-to-end test: an Actix app wrapped with MetricsMiddleware serves several
+//! routes; we then issue requests and parse the Prometheus scrape output to
+//! assert on the metrics that should appear.
+ +use actix_web::{App, HttpResponse, http::StatusCode, test, web}; +use prometheus::Encoder; +use service_utils::observability::{ + LabelConfig, MetricsMiddleware, Observability, ObservabilityConfig, SaturationDeps, + register_observers, +}; + +fn cfg() -> ObservabilityConfig { + ObservabilityConfig { + enabled: true, + bind: "127.0.0.1".parse().unwrap(), + port: 0, + label: LabelConfig::default(), + collect_interval: std::time::Duration::from_secs(10), + instance_id: "it".into(), + service_name: "sp-it".into(), + service_version: "0".into(), + deployment_environment: None, + otlp_endpoint: None, + } +} + +fn scrape(obs: &Observability) -> String { + let metric_families = obs.registry().gather(); + let mut buf = Vec::new(); + prometheus::TextEncoder::new() + .encode(&metric_families, &mut buf) + .unwrap(); + String::from_utf8(buf).unwrap() +} + +#[actix_web::test] +async fn metrics_appear_after_requests() { + let obs = Observability::init(cfg()).unwrap(); + let mw = MetricsMiddleware::new(&obs.meter(), LabelConfig::default()); + let app = test::init_service( + App::new() + .wrap(mw) + .route( + "/ping", + web::get().to(|| async { HttpResponse::Ok().finish() }), + ) + .route( + "/echo/{name}", + web::post().to(|p: web::Path| async move { + HttpResponse::Created().body(p.into_inner()) + }), + ) + .route( + "/boom", + web::get().to(|| async { HttpResponse::InternalServerError().finish() }), + ), + ) + .await; + + for _ in 0..3 { + let req = test::TestRequest::get().uri("/ping").to_request(); + let resp = test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::OK); + } + let req = test::TestRequest::post().uri("/echo/world").to_request(); + let resp = test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::CREATED); + + let req = test::TestRequest::get().uri("/boom").to_request(); + let resp = test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::INTERNAL_SERVER_ERROR); + + let req = 
test::TestRequest::get().uri("/no-such-route").to_request(); + let resp = test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::NOT_FOUND); + + let body = scrape(&obs); + + // Request duration histogram exists with expected labels for /ping (3 hits). + let ping_count_line = body + .lines() + .find(|l| { + l.starts_with("http_server_request_duration_seconds_count{") + && l.contains("http_route=\"/ping\"") + && l.contains("http_request_method=\"GET\"") + && l.contains("http_response_status_code=\"200\"") + }) + .unwrap_or_else(|| panic!("no /ping count line in:\n{body}")); + let ping_count: f64 = ping_count_line + .rsplit_once(' ') + .unwrap() + .1 + .trim() + .parse() + .unwrap(); + assert_eq!(ping_count as u64, 3); + + // 5xx series for /boom appears. + assert!( + body.lines().any(|l| { + l.starts_with("http_server_request_duration_seconds_count{") + && l.contains("http_route=\"/boom\"") + && l.contains("http_response_status_code=\"500\"") + }), + "no /boom 500 series in:\n{body}" + ); + + // Unmatched path uses the sentinel. + assert!( + body.lines().any(|l| { + l.starts_with("http_server_request_duration_seconds_count{") + && l.contains("http_route=\"__not_found__\"") + }), + "no __not_found__ series in:\n{body}" + ); + + // busy_duration_total > 0 + let busy = body + .lines() + .find(|l| l.starts_with("http_server_busy_duration_seconds_total{")) + .unwrap_or_else(|| panic!("no busy_duration line in:\n{body}")); + let busy_value: f64 = busy.rsplit_once(' ').unwrap().1.trim().parse().unwrap(); + assert!( + busy_value > 0.0, + "expected busy_duration > 0, got {busy_value}" + ); + + // active_requests returns to 0 after all requests complete. 
+ let active_lines: Vec<_> = body + .lines() + .filter(|l| l.starts_with("http_server_active_requests{")) + .collect(); + for line in &active_lines { + let v: f64 = line.rsplit_once(' ').unwrap().1.trim().parse().unwrap(); + assert_eq!(v, 0.0, "active_requests not zero: {line}"); + } +} + +/// Sanity check that the saturation observers register and the tokio runtime +/// gauges actually appear in the Prometheus scrape under a real tokio runtime. +/// The values themselves come from `tokio::runtime::Handle::metrics()`; we +/// just assert the wiring is intact (presence + plausible workers count). +#[actix_web::test] +async fn runtime_tokio_metrics_appear_after_register_observers() { + let obs = Observability::init(cfg()).unwrap(); + register_observers(&obs.meter(), SaturationDeps::default()).unwrap(); + + let body = scrape(&obs); + + let workers_line = body + .lines() + .find(|l| l.starts_with("runtime_tokio_workers ")) + .unwrap_or_else(|| panic!("no runtime_tokio_workers in:\n{body}")); + let workers: f64 = workers_line.rsplit_once(' ').unwrap().1.trim().parse().unwrap(); + assert!(workers >= 1.0, "expected >=1 worker, got {workers}"); + + assert!( + body.lines().any(|l| l.starts_with("runtime_tokio_global_queue_depth ")), + "no runtime_tokio_global_queue_depth in:\n{body}" + ); + assert!( + body.lines() + .any(|l| l.starts_with("runtime_tokio_workers_busy_time_seconds_total ")), + "no runtime_tokio_workers_busy_time_seconds_total in:\n{body}" + ); +} + +#[actix_web::test] +async fn cardinality_stays_within_budget() { + let obs = Observability::init(cfg()).unwrap(); + let mw = MetricsMiddleware::new(&obs.meter(), LabelConfig::default()); + let app = test::init_service( + App::new() + .wrap(mw) + .route( + "/a", + web::get().to(|| async { HttpResponse::Ok().finish() }), + ) + .route( + "/b", + web::get().to(|| async { HttpResponse::Ok().finish() }), + ) + .route( + "/c", + web::post().to(|| async { HttpResponse::Created().finish() }), + ), + ) + .await; + + for _ 
in 0..10 { + for path in &["/a", "/b"] { + let req = test::TestRequest::get().uri(path).to_request(); + let _ = test::call_service(&app, req).await; + } + let req = test::TestRequest::post().uri("/c").to_request(); + let _ = test::call_service(&app, req).await; + } + + let body = scrape(&obs); + let series = body + .lines() + .filter(|l| !l.is_empty() && !l.starts_with('#')) + .count(); + + // Budget for this scenario: 3 routes × 1 method each × 1 status × ~12 + // (10 buckets + sum + count) = ~36 series for the histogram, plus 3 for + // busy_duration, plus 1 for active_requests, plus a few from `target_info` + // that the prometheus exporter emits. Headroom: 200. + assert!( + series <= 200, + "cardinality regression: {series} series\n{body}" + ); +} diff --git a/crates/superposition/Cargo.toml b/crates/superposition/Cargo.toml index 26fa4bada..60f613c9c 100644 --- a/crates/superposition/Cargo.toml +++ b/crates/superposition/Cargo.toml @@ -42,6 +42,8 @@ tracing = { workspace = true } tracing-subscriber = { workspace = true } tracing-actix-web = { workspace = true } json-subscriber = { version = "0.2.7", features = ["tracing-log"] } +tokio = { workspace = true } +opentelemetry = { workspace = true } [lints] workspace = true diff --git a/crates/superposition/src/app_state.rs b/crates/superposition/src/app_state.rs index 60a89b515..d6d84d978 100644 --- a/crates/superposition/src/app_state.rs +++ b/crates/superposition/src/app_state.rs @@ -102,13 +102,12 @@ pub async fn get( }, snowflake_generator, app_env, - tenant_middleware_exclusion_list: get_from_env_unsafe::( - "TENANT_MIDDLEWARE_EXCLUSION_LIST", - ) - .expect("TENANT_MIDDLEWARE_EXCLUSION_LIST is not set") - .split(',') - .map(String::from) - .collect::>(), + tenant_middleware_exclusion_list: + get_from_env_unsafe::("TENANT_MIDDLEWARE_EXCLUSION_LIST") + .expect("TENANT_MIDDLEWARE_EXCLUSION_LIST is not set") + .split(',') + .map(String::from) + .collect::>(), service_prefix, superposition_token: 
get_superposition_token(kms_client, &app_env).await, redis: redis_pool, diff --git a/crates/superposition/src/main.rs b/crates/superposition/src/main.rs index 44d32f407..510ba6380 100644 --- a/crates/superposition/src/main.rs +++ b/crates/superposition/src/main.rs @@ -6,7 +6,7 @@ mod resolve; mod webhooks; mod workspace; -use std::{io::Result, time::Duration}; +use std::{io::Result, sync::Arc, time::Duration}; use actix_files::Files; use actix_web::{ @@ -31,6 +31,10 @@ use service_utils::{ request_response_logging::RequestResponseLogger, workspace_context::OrgWorkspaceMiddlewareFactory, }, + observability::{ + FredPoolStats, MetricsMiddleware, Observability, ObservabilityConfig, RedisStats, + SaturationDeps, register_observers, spawn_metrics_server, + }, service::types::AppEnv, }; use superposition_macros::bad_argument; @@ -77,6 +81,29 @@ async fn main() -> Result<()> { ) .init(); + // --- Step 1: Observability init (early, before AppState build) --- + // `from_env` errors are operator-config mistakes (bad port, bad IP, etc.) — fail loudly. + let obs_cfg = + ObservabilityConfig::from_env().expect("invalid observability env config"); + // `Observability::init` may fail transiently (e.g. OTLP endpoint unreachable at startup). + // Rather than killing the binary we log a warning and serve traffic without metrics. + let observability = if obs_cfg.enabled { + match Observability::init(obs_cfg.clone()) { + Ok(o) => Some(o), + Err(e) => { + tracing::warn!( + error = %e, + "observability init failed; metrics disabled for this instance" + ); + None + } + } + } else { + None + }; + // Reflect actual init outcome: obs_enabled is true only when we have a live Observability. 
+ let obs_enabled = observability.is_some(); + let service_prefix: String = get_from_env_unsafe("SERVICE_PREFIX").expect("SERVICE_PREFIX is not set"); @@ -142,11 +169,59 @@ async fn main() -> Result<()> { .await, ); + // --- Step 2: Register saturation observers --- + // app_state.db_pool is PgSchemaConnectionPool (= Pool>), + // not Arc-wrapped, so we wrap it here. + // app_state.redis is Option. + let redis_handle: Option> = + app_state.redis.as_ref().map(|pool| { + Arc::new(FredPoolStats(pool.clone())) as Arc + }); + + if let Some(obs) = observability.as_ref() { + let deps = SaturationDeps { + db_pool: Some(Arc::new(app_state.db_pool.clone())), + redis_client: redis_handle, + }; + register_observers(&obs.meter(), deps) + .expect("saturation observer registration failed"); + } + + // --- Step 3: Spawn the metrics server --- + let metrics_server_handle = if let Some(obs) = observability.as_ref() { + let bind = std::net::SocketAddr::new(obs_cfg.bind, obs_cfg.port); + match spawn_metrics_server(obs.registry(), bind) { + Ok(h) => Some(h), + Err(e) => { + tracing::warn!( + error = %e, + bind = %bind, + "metrics server bind failed; /metrics endpoint disabled for this instance" + ); + None + } + } + } else { + None + }; + + // --- Step 4: Capture meter + label_cfg for the closure --- + // When obs_enabled is true, observability is Some and meter() is valid. + // When obs_enabled is false, we still need a Meter instance to construct + // MetricsMiddleware inside the closure (Condition evaluates its argument + // regardless of the flag). We use the global noop meter in that case. 
+ let metrics_meter = observability + .as_ref() + .map(|o| o.meter()) + .unwrap_or_else(|| opentelemetry::global::meter("superposition-noop")); + let metrics_label_cfg = obs_cfg.label; + let auth_n = AuthNHandler::init(&kms_client, &app_env, base.clone()).await; let auth_z = AuthZHandler::init(&kms_client, &app_env).await; let auth_z_manager = AuthZManager::init(&kms_client, &app_env).await; - HttpServer::new(move || { + // --- Step 5: Build and run both servers concurrently --- + let main_server = HttpServer::new(move || { let leptos_options = &conf.leptos_options; let site_root = &leptos_options.site_root; let leptos_envs = ui_envs.clone(); @@ -216,6 +291,13 @@ async fn main() -> Result<()> { )) // Conditionally add request/response logging middleware for development .wrap(RequestResponseLogger) + // MetricsMiddleware gated by SUPERPOSITION_METRICS_ENABLED (Approach B: Condition). + // metrics_meter is a real Meter when enabled, or a noop Meter when disabled. + .wrap(Condition::new( + obs_enabled, + MetricsMiddleware::new(&metrics_meter, metrics_label_cfg), + )) + // TracingLogger is outermost — last .wrap() runs first on requests. .wrap(TracingLogger::::new()) }) .bind(("0.0.0.0", cac_port))? @@ -223,8 +305,24 @@ async fn main() -> Result<()> { .keep_alive(Duration::from_secs( get_from_env_unsafe("ACTIX_KEEP_ALIVE").unwrap_or(120), )) - .run() - .await + .run(); + + // --- Step 6: Run the main server; metrics server is a detached best-effort task --- + // Using try_join! would abort the main API server if the metrics task ever returned + // an error (port reclaimed, listener closed). That contradicts the "metrics are + // best-effort" stance applied throughout. Detach instead and log on error. 
+ if let Some(metrics_handle) = metrics_server_handle { + tokio::spawn(async move { + if let Err(e) = metrics_handle.await { + tracing::warn!( + error = %e, + "metrics server exited with error; /metrics endpoint is now unavailable" + ); + } + }); + } + main_server.await?; + Ok(()) } trait ScopeExt { diff --git a/crates/superposition_types/src/database/models.rs b/crates/superposition_types/src/database/models.rs index 1af12e70e..c6544ba24 100644 --- a/crates/superposition_types/src/database/models.rs +++ b/crates/superposition_types/src/database/models.rs @@ -9,8 +9,8 @@ use chrono::{DateTime, Utc}; use derive_more::{Deref, DerefMut}; #[cfg(feature = "diesel_derives")] use diesel::{ - AsChangeset, AsExpression, FromSqlRow, Insertable, QueryId, Queryable, Selectable, sql_types::{Json, Text}, + AsChangeset, AsExpression, FromSqlRow, Insertable, QueryId, Queryable, Selectable, }; use serde::{Deserialize, Deserializer, Serialize}; #[cfg(all( @@ -23,10 +23,10 @@ use superposition_derives::TextFromSqlNoValidation; #[cfg(feature = "diesel_derives")] use superposition_derives::{JsonFromSql, JsonToSql, TextToSql}; -#[cfg(feature = "disable_db_data_validation")] -use super::DisableDBValidation; #[cfg(feature = "diesel_derives")] use super::superposition_schema::superposition::*; +#[cfg(feature = "disable_db_data_validation")] +use super::DisableDBValidation; #[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Deref, DerefMut)] #[serde(try_from = "String")] diff --git a/docs/superpowers/plans/2026-05-10-otel-golden-signals-middleware.md b/docs/superpowers/plans/2026-05-10-otel-golden-signals-middleware.md new file mode 100644 index 000000000..956ef35f6 --- /dev/null +++ b/docs/superpowers/plans/2026-05-10-otel-golden-signals-middleware.md @@ -0,0 +1,2386 @@ +# OpenTelemetry Golden-Signals Middleware Implementation Plan + +> **Status:** Plan executed; shipped with deviations. See "Post-implementation deviations" below before treating any specific task as ground truth. 
The plan body is preserved as historical record of original intent. + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +## Post-implementation deviations + +The PR shipped with the following changes versus this plan. Affected tasks are flagged inline; the canonical summary is in §0 of the design spec. + +- **Tasks 2, 12, 16, 17 changed substantially or obsolete.** + - **Task 2 (`.cargo/config.toml` for `tokio_unstable`):** not needed. Tokio 1.50 exposes the runtime metrics APIs we use as stable. The file was not created (or removed if already present). + - **Task 12 (`/healthz` `/livez` `/readyz` handlers) + Task 17 (auth-bypass exclusion):** dropped. The pre-existing `GET /health` covers the up-check role; the k8s liveness/readiness split is deferred to a follow-up PR. + - **Task 16 (Tokio runtime saturation):** rewritten. No background sampler, no `RuntimeMonitor`, no atomics snapshot, no `tokio-metrics` dep. Each observable callback reads `Handle::metrics()` directly. `runtime.tokio.workers.busy_ratio` (Gauge) is replaced with `runtime.tokio.workers.busy.time` (monotonic Counter, seconds, summed across workers); Prometheus computes saturation at query time. +- **Task 1 / 3:** `opentelemetry-semantic-conventions` dep is **not** in the final tree — attribute names are used as literals. `tokio-metrics` was added then removed when Task 16 was rewritten. +- **Task 18 (`main.rs` wiring):** `SaturationDeps` no longer has `tokio_collect_interval`; the `.configure(configure_health_endpoints)` line is not present. +- **Task 21 (README + makefile):** the `tokio_unstable` build-flag note is removed; no makefile flag changes are required. 
+ +**Goal:** Add an Actix middleware and supporting subsystem to `crates/service_utils` that emits Google SRE golden signals (latency, traffic, errors, saturation) for every HTTP route on the main API, exposed via Prometheus scrape on a dedicated port and optional OTLP push, using OpenTelemetry. + +**Architecture:** New `service_utils::observability` module owns: (a) `init()` that builds an OTel `MeterProvider` with a Prometheus exporter and an optional OTLP exporter, (b) an Actix `MetricsMiddleware` that records `http.server.request.duration` (histogram), `http.server.busy.duration` (counter), and `http.server.active_requests` (UpDownCounter) for every request, (c) saturation collectors for r2d2 DB pool, fred Redis pool, and (cfg-gated) tokio-metrics, and (d) a separate `HttpServer` on `SUPERPOSITION_METRICS_PORT` exposing `/metrics`. Health endpoints `/healthz`/`/livez`/`/readyz` mount on the main port and bypass auth via the existing `tenant_middleware_exclusion_list`. + +**Tech Stack:** Rust, Actix-web 4, OpenTelemetry SDK 0.27 (`opentelemetry`, `opentelemetry_sdk`, `opentelemetry-prometheus`, `opentelemetry-otlp`, `opentelemetry-semantic-conventions`), `prometheus` 0.13, `tokio-metrics` 0.3 (under `cfg(tokio_unstable)`), Diesel/r2d2, fred (Redis client). + +**Spec:** [`docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md`](../specs/2026-05-10-otel-golden-signals-middleware-design.md) + +**Notes for the implementer:** + +- The OpenTelemetry Rust SDK has had API churn between minor versions. The exact import paths and builder method names below are written against `opentelemetry` 0.27. If you pin a different version in Task 1, expect to adjust 1–3 import paths or method names per call site. The plan uses the **stable** APIs only (no unstable/preview features). +- After Task 1 (deps), run `cargo check -p service_utils` after every code-touching task to catch wiring issues early — even on tasks that don't add tests yet. 
+- File commit boundary: each task ends with one commit. If a step within a task fails, fix and continue within the same task before committing. +- All paths in this plan are relative to the repo root: `/`. + +--- + +## Task 1: Add workspace dependencies + +**Files:** +- Modify: `Cargo.toml` (root) + +- [ ] **Step 1: Add OpenTelemetry deps to `[workspace.dependencies]`** + +Add the following block to the `[workspace.dependencies]` section of the root `Cargo.toml` (alphabetical order, near the existing entries like `prometheus`-adjacent / `opentelemetry`-adjacent slots): + +```toml +opentelemetry = { version = "0.27", default-features = false, features = ["metrics"] } +opentelemetry_sdk = { version = "0.27", default-features = false, features = ["metrics", "rt-tokio"] } +opentelemetry-prometheus = { version = "0.27", default-features = false } +opentelemetry-otlp = { version = "0.27", default-features = false, features = ["metrics", "http-proto", "reqwest-client"] } +opentelemetry-semantic-conventions = { version = "0.27" } +prometheus = { version = "0.13", default-features = false } +tokio-metrics = { version = "0.3", default-features = false, features = ["rt"] } +humantime = "2.1" +``` + +- [ ] **Step 2: Verify the workspace still resolves** + +Run: `cargo metadata --format-version 1 > /dev/null` +Expected: exit code 0, no errors. (This forces `cargo` to re-resolve the workspace without compiling.) + +- [ ] **Step 3: Commit** + +```bash +git add Cargo.toml +git commit -m "build: add opentelemetry deps to workspace + +Adds opentelemetry, opentelemetry_sdk, opentelemetry-prometheus, +opentelemetry-otlp, opentelemetry-semantic-conventions, prometheus, +tokio-metrics, and humantime as workspace dependencies. Enabled in +service_utils in a follow-up commit. 
+" +``` + +--- + +## Task 2: Add `.cargo/config.toml` for `tokio_unstable` + +**Files:** +- Create: `.cargo/config.toml` + +- [ ] **Step 1: Create the file** + +```toml +# Required by tokio-metrics for runtime instrumentation. Affects all +# crates in the workspace; only the saturation::tokio_runtime module +# consumes the additional APIs that this flag unlocks. +[build] +rustflags = ["--cfg", "tokio_unstable"] +``` + +- [ ] **Step 2: Verify the workspace still builds** + +Run: `cargo check --workspace` +Expected: exit code 0. (Build may take a while on first run; that's fine.) + +- [ ] **Step 3: Commit** + +```bash +git add .cargo/config.toml +git commit -m "build: enable tokio_unstable workspace-wide + +Required by tokio-metrics for runtime instrumentation introduced in +the upcoming observability subsystem. tokio_unstable only adds APIs; +no behavioural change for existing code. +" +``` + +--- + +## Task 3: Enable observability deps in `service_utils` + +**Files:** +- Modify: `crates/service_utils/Cargo.toml` + +- [ ] **Step 1: Add the dependency lines** + +Append to the `[dependencies]` block of `crates/service_utils/Cargo.toml` (after the existing entries, preserving alphabetical-ish order): + +```toml +opentelemetry = { workspace = true } +opentelemetry_sdk = { workspace = true } +opentelemetry-prometheus = { workspace = true } +opentelemetry-otlp = { workspace = true } +opentelemetry-semantic-conventions = { workspace = true } +prometheus = { workspace = true } +tokio-metrics = { workspace = true } +humantime = { workspace = true } +``` + +(`fred` already has the `metrics` feature enabled at line 22 — no change needed there.) + +- [ ] **Step 2: Verify the crate still compiles** + +Run: `cargo check -p service_utils` +Expected: exit code 0. 
+ +- [ ] **Step 3: Commit** + +```bash +git add crates/service_utils/Cargo.toml +git commit -m "build(service_utils): enable opentelemetry deps" +``` + +--- + +## Task 4: Module skeleton in `service_utils` + +**Files:** +- Modify: `crates/service_utils/src/lib.rs` +- Create: `crates/service_utils/src/observability.rs` + +- [ ] **Step 1: Add the `pub mod` line** + +Edit `crates/service_utils/src/lib.rs`, adding a new line in alphabetical position: + +```rust +pub mod aws; +pub mod db; +pub mod encryption; +pub mod extensions; +pub mod helpers; +pub mod middlewares; +pub mod observability; // <-- NEW LINE, between middlewares and redis +pub mod redis; +pub mod registry; +pub mod service; +``` + +- [ ] **Step 2: Create `observability.rs` with public surface stubs** + +```rust +//! HTTP golden-signals metrics exposition via OpenTelemetry. +//! +//! See `docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md`. + +mod config; +mod health; +mod meters; +mod metrics_server; +mod middleware; +mod saturation; + +pub use config::{LabelConfig, ObservabilityConfig}; +pub use health::{health_endpoint_paths, health_endpoints}; +pub use metrics_server::spawn_metrics_server; +pub use middleware::MetricsMiddleware; +pub use saturation::{register_observers, SaturationDeps}; + +use std::sync::Arc; + +use opentelemetry::metrics::Meter; +use opentelemetry_sdk::metrics::SdkMeterProvider; +use prometheus::Registry; +use thiserror::Error; + +#[derive(Debug, Error)] +pub enum ObservabilityError { + #[error("prometheus exporter init failed: {0}")] + PrometheusInit(String), + #[error("otlp exporter init failed: {0}")] + OtlpInit(String), + #[error("config error: {0}")] + Config(String), + #[error(transparent)] + Io(#[from] std::io::Error), +} + +pub struct Observability { + provider: SdkMeterProvider, + registry: Arc<Registry>, + meter: Meter, +} + +impl Observability { + pub fn meter(&self) -> Meter { + self.meter.clone() + } + + pub fn registry(&self) -> Arc<Registry> { + 
self.registry.clone() + } + + pub fn shutdown(self) -> Result<(), ObservabilityError> { + self.provider + .shutdown() + .map_err(|e| ObservabilityError::PrometheusInit(e.to_string())) + } + + pub fn init(_cfg: ObservabilityConfig) -> Result<Self, ObservabilityError> { + // Real implementation lands in Task 7. + unimplemented!("Observability::init implemented in Task 7") + } +} +``` + +(`thiserror` is already used elsewhere in the workspace; if `service_utils/Cargo.toml` does not yet depend on it, add `thiserror = "1"` to the `[dependencies]` block. Quick check: `grep thiserror crates/service_utils/Cargo.toml`.) + +- [ ] **Step 3: Verify it compiles** + +Run: `cargo check -p service_utils` +Expected: the check fails at this point, because the child modules named in the `mod` declarations don't exist yet. That is fine — proceed to Step 4, which adds them; the check must pass in Step 5. + +- [ ] **Step 4: Create empty stub files for child modules so this task ends compilable** + +Create each of: + +- `crates/service_utils/src/observability/config.rs`: + ```rust + //! Stub — real implementation in Task 5. + pub struct ObservabilityConfig; + pub struct LabelConfig; + ``` +- `crates/service_utils/src/observability/meters.rs`: + ```rust + //! Stub — real implementation in Task 7. + ``` +- `crates/service_utils/src/observability/middleware.rs`: + ```rust + //! Stub — real implementation in Task 11. + pub struct MetricsMiddleware; + ``` +- `crates/service_utils/src/observability/metrics_server.rs`: + ```rust + //! Stub — real implementation in Task 13. + use std::{net::SocketAddr, sync::Arc}; + use prometheus::Registry; + pub fn spawn_metrics_server( + _registry: Arc<Registry>, + _bind: SocketAddr, + ) -> std::io::Result { + unimplemented!("Task 13") + } + ``` +- `crates/service_utils/src/observability/health.rs`: + ```rust + //! Stub — real implementation in Task 12. 
+ pub fn health_endpoints() -> actix_web::Scope { + actix_web::web::scope("") + } + pub fn health_endpoint_paths() -> &'static [&'static str] { + &[] + } + ``` +- `crates/service_utils/src/observability/saturation.rs`: + ```rust + //! Stub — real implementation in Task 14. + use opentelemetry::metrics::Meter; + pub struct SaturationDeps; + pub fn register_observers( + _meter: &Meter, + _deps: SaturationDeps, + ) -> Result<(), super::ObservabilityError> { + Ok(()) + } + ``` + +- [ ] **Step 5: Verify it compiles** + +Run: `cargo check -p service_utils` +Expected: exit code 0. + +- [ ] **Step 6: Commit** + +```bash +git add crates/service_utils/src/lib.rs crates/service_utils/src/observability.rs crates/service_utils/src/observability/ +git commit -m "feat(observability): module skeleton + +Adds the empty module structure that subsequent commits flesh out: +- observability.rs: public surface, Observability handle, errors +- observability/{config,meters,middleware,metrics_server,health,saturation}.rs: stubs + +No behaviour change. The Observability::init() body is unimplemented!() +until Task 7. +" +``` + +--- + +## Task 5: `ObservabilityConfig` from env + +**Files:** +- Modify: `crates/service_utils/src/observability/config.rs` + +- [ ] **Step 1: Write the failing test** + +Replace the contents of `crates/service_utils/src/observability/config.rs` with: + +```rust +//! Configuration for the observability subsystem, parsed from env vars. 
+ +use std::{net::IpAddr, str::FromStr, time::Duration}; + +#[derive(Debug, Clone)] +pub struct ObservabilityConfig { + pub enabled: bool, + pub bind: IpAddr, + pub port: u16, + pub label: LabelConfig, + pub collect_interval: Duration, + pub instance_id: String, + pub service_name: String, + pub service_version: String, + pub deployment_environment: Option<String>, + pub otlp_endpoint: Option<String>, +} + +#[derive(Debug, Clone, Copy)] +pub struct LabelConfig { + pub with_org_label: bool, + pub with_workspace_label: bool, +} + +impl Default for LabelConfig { + fn default() -> Self { + Self { with_org_label: true, with_workspace_label: true } + } +} + +impl ObservabilityConfig { + pub fn from_env() -> Result<Self, String> { + fn env_bool(key: &str, default: bool) -> Result<bool, String> { + match std::env::var(key) { + Ok(v) => v.parse::<bool>().map_err(|_| format!("{key} must be true or false")), + Err(_) => Ok(default), + } + } + fn env_str(key: &str, default: &str) -> String { + std::env::var(key).unwrap_or_else(|_| default.to_owned()) + } + fn env_opt(key: &str) -> Option<String> { + std::env::var(key).ok().filter(|s| !s.is_empty()) + } + + let enabled = env_bool("SUPERPOSITION_METRICS_ENABLED", true)?; + let bind = IpAddr::from_str(&env_str("SUPERPOSITION_METRICS_BIND", "0.0.0.0")) + .map_err(|e| format!("SUPERPOSITION_METRICS_BIND: {e}"))?; + let port: u16 = env_str("SUPERPOSITION_METRICS_PORT", "9091") + .parse() + .map_err(|e| format!("SUPERPOSITION_METRICS_PORT: {e}"))?; + let with_org_label = env_bool("SUPERPOSITION_METRICS_LABEL_ORG", true)?; + let with_workspace_label = env_bool("SUPERPOSITION_METRICS_LABEL_WORKSPACE", true)?; + let collect_interval = + humantime::parse_duration(&env_str("SUPERPOSITION_METRICS_COLLECT_INTERVAL", "10s")) + .map_err(|e| format!("SUPERPOSITION_METRICS_COLLECT_INTERVAL: {e}"))?; + let instance_id = env_opt("SUPERPOSITION_INSTANCE_ID") + .or_else(|| hostname_or_none()) + .unwrap_or_else(|| "unknown".to_owned()); + let service_name = env_str("OTEL_SERVICE_NAME", "superposition"); + 
let service_version = env!("CARGO_PKG_VERSION").to_owned(); + let deployment_environment = env_opt("APP_ENV").or_else(|| env_opt("DEPLOYMENT_ENV")); + let otlp_endpoint = env_opt("OTEL_EXPORTER_OTLP_ENDPOINT"); + + Ok(Self { + enabled, + bind, + port, + label: LabelConfig { with_org_label, with_workspace_label }, + collect_interval, + instance_id, + service_name, + service_version, + deployment_environment, + otlp_endpoint, + }) + } +} + +fn hostname_or_none() -> Option<String> { + // Avoid pulling in a hostname crate; read /etc/hostname (present on most Linux distros; absent on macOS, where this yields None). + std::fs::read_to_string("/etc/hostname") + .ok() + .map(|s| s.trim().to_owned()) + .filter(|s| !s.is_empty()) +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Serializes the env-mutating tests to avoid env races without adding a dep such as `serial_test`: + /// a process-wide lock plus an env-snapshot helper that restores the previous values afterwards. + fn with_env<F: FnOnce()>(vars: &[(&str, Option<&str>)], f: F) { + use std::sync::Mutex; + static LOCK: Mutex<()> = Mutex::new(()); + let _guard = LOCK.lock().unwrap(); + let prev: Vec<_> = + vars.iter().map(|(k, _)| (k.to_string(), std::env::var(k).ok())).collect(); + for (k, v) in vars { + match v { + Some(v) => std::env::set_var(k, v), + None => std::env::remove_var(k), + } + } + f(); + for (k, v) in prev { + match v { + Some(v) => std::env::set_var(&k, &v), + None => std::env::remove_var(&k), + } + } + } + + #[test] + fn defaults_when_unset() { + with_env( + &[ + ("SUPERPOSITION_METRICS_ENABLED", None), + ("SUPERPOSITION_METRICS_PORT", None), + ("SUPERPOSITION_METRICS_BIND", None), + ("SUPERPOSITION_METRICS_LABEL_ORG", None), + ("SUPERPOSITION_METRICS_LABEL_WORKSPACE", None), + ("SUPERPOSITION_METRICS_COLLECT_INTERVAL", None), + ("OTEL_EXPORTER_OTLP_ENDPOINT", None), + ("OTEL_SERVICE_NAME", None), + ], + || { + let cfg = ObservabilityConfig::from_env().unwrap(); + assert!(cfg.enabled); + assert_eq!(cfg.port, 9091); + assert_eq!(cfg.bind.to_string(), "0.0.0.0"); + assert!(cfg.label.with_org_label); + 
assert!(cfg.label.with_workspace_label); + assert_eq!(cfg.collect_interval, Duration::from_secs(10)); + assert_eq!(cfg.service_name, "superposition"); + assert_eq!(cfg.otlp_endpoint, None); + }, + ); + } + + #[test] + fn explicit_overrides() { + with_env( + &[ + ("SUPERPOSITION_METRICS_ENABLED", Some("false")), + ("SUPERPOSITION_METRICS_PORT", Some("9999")), + ("SUPERPOSITION_METRICS_BIND", Some("127.0.0.1")), + ("SUPERPOSITION_METRICS_LABEL_WORKSPACE", Some("false")), + ("SUPERPOSITION_METRICS_COLLECT_INTERVAL", Some("30s")), + ("OTEL_EXPORTER_OTLP_ENDPOINT", Some("http://collector:4318")), + ("OTEL_SERVICE_NAME", Some("sp-test")), + ], + || { + let cfg = ObservabilityConfig::from_env().unwrap(); + assert!(!cfg.enabled); + assert_eq!(cfg.port, 9999); + assert_eq!(cfg.bind.to_string(), "127.0.0.1"); + assert!(cfg.label.with_org_label); // default still true + assert!(!cfg.label.with_workspace_label); + assert_eq!(cfg.collect_interval, Duration::from_secs(30)); + assert_eq!(cfg.otlp_endpoint.as_deref(), Some("http://collector:4318")); + assert_eq!(cfg.service_name, "sp-test"); + }, + ); + } + + #[test] + fn malformed_port_errors() { + with_env( + &[("SUPERPOSITION_METRICS_PORT", Some("not-a-number"))], + || { + let err = ObservabilityConfig::from_env().unwrap_err(); + assert!(err.contains("SUPERPOSITION_METRICS_PORT")); + }, + ); + } +} +``` + +- [ ] **Step 2: Run the tests** + +Run: `cargo test -p service_utils observability::config -- --test-threads=1` +Expected: 3 tests pass. + +(`--test-threads=1` is required because the tests mutate process env vars; the in-test mutex covers same-binary races but doctests/other tests in parallel could interleave.) + +- [ ] **Step 3: Commit** + +```bash +git add crates/service_utils/src/observability/config.rs +git commit -m "feat(observability): config from env + +Reads SUPERPOSITION_METRICS_* and OTEL_* env vars into a typed +ObservabilityConfig. 
Defaults: enabled, port 9091, bind 0.0.0.0, +both org/workspace labels on, 10s collect interval. +" +``` + +--- + +## Task 6: HTTP method normalization helper + +**Files:** +- Modify: `crates/service_utils/src/observability/middleware.rs` + +- [ ] **Step 1: Replace stub with TDD scaffold** + +```rust +//! Actix middleware that records OpenTelemetry HTTP server metrics. + +/// Per OpenTelemetry HTTP semantic conventions, only known methods get their +/// literal name; anything else collapses to `_OTHER`. Prevents weirdo clients +/// from blowing up the cardinality of the `http.request.method` attribute. +pub(crate) fn normalize_method(m: &actix_web::http::Method) -> &'static str { + match m.as_str() { + "GET" => "GET", + "POST" => "POST", + "PUT" => "PUT", + "DELETE" => "DELETE", + "PATCH" => "PATCH", + "HEAD" => "HEAD", + "OPTIONS" => "OPTIONS", + "TRACE" => "TRACE", + "CONNECT" => "CONNECT", + _ => "_OTHER", + } +} + +pub struct MetricsMiddleware; // placeholder until Task 11 + +#[cfg(test)] +mod tests { + use super::*; + use actix_web::http::Method; + + #[test] + fn known_methods_pass_through() { + for (m, expected) in [ + (Method::GET, "GET"), + (Method::POST, "POST"), + (Method::PUT, "PUT"), + (Method::DELETE, "DELETE"), + (Method::PATCH, "PATCH"), + (Method::HEAD, "HEAD"), + (Method::OPTIONS, "OPTIONS"), + (Method::TRACE, "TRACE"), + (Method::CONNECT, "CONNECT"), + ] { + assert_eq!(normalize_method(&m), expected); + } + } + + #[test] + fn unknown_methods_collapse_to_other() { + let m = Method::from_bytes(b"XPROPFIND").unwrap(); + assert_eq!(normalize_method(&m), "_OTHER"); + let m = Method::from_bytes(b"WEIRDO").unwrap(); + assert_eq!(normalize_method(&m), "_OTHER"); + } +} +``` + +- [ ] **Step 2: Run the tests** + +Run: `cargo test -p service_utils observability::middleware` +Expected: 2 tests pass. 
+ +- [ ] **Step 3: Commit** + +```bash +git add crates/service_utils/src/observability/middleware.rs +git commit -m "feat(observability): http method normalization + +Per OTel semconv: collapse unknown methods to _OTHER to bound the +cardinality of the http.request.method label. +" +``` + +--- + +## Task 7: `Observability::init()` with Prometheus exporter + +**Files:** +- Modify: `crates/service_utils/src/observability.rs` +- Modify: `crates/service_utils/src/observability/meters.rs` + +- [ ] **Step 1: Define `HttpMeters` struct** + +Replace `crates/service_utils/src/observability/meters.rs`: + +```rust +//! Typed handles for the metric instruments emitted by the HTTP middleware. + +use opentelemetry::metrics::{Counter, Histogram, Meter, UpDownCounter}; + +/// Histogram + counter + up-down counter for HTTP server golden signals. Built once at +/// startup and cloned cheaply; instruments are `Arc<>`-backed internally. +#[derive(Clone)] +pub struct HttpMeters { + pub request_duration: Histogram<f64>, + pub busy_duration: Counter<f64>, + pub active_requests: UpDownCounter<i64>, +} + +impl HttpMeters { + pub fn new(meter: &Meter) -> Self { + let request_duration = meter + .f64_histogram("http.server.request.duration") + .with_unit("s") + .with_description("Duration of HTTP server requests, in seconds.") + .with_boundaries(vec![ + 0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, + ]) + .build(); + + let busy_duration = meter + .f64_counter("http.server.busy.duration") + .with_unit("s") + .with_description( + "Cumulative seconds spent serving HTTP requests; \ + rate() over a window gives time-averaged request concurrency.", + ) + .build(); + + let active_requests = meter + .i64_up_down_counter("http.server.active_requests") + .with_description("Number of HTTP server requests currently in flight.") + .build(); + + Self { request_duration, busy_duration, active_requests } + } +} +``` + +- [ ] **Step 2: Implement `Observability::init`** + +Replace the body of `Observability::init` in 
`crates/service_utils/src/observability.rs`: + +```rust +impl Observability { + pub fn init(cfg: ObservabilityConfig) -> Result<Self, ObservabilityError> { + use opentelemetry::KeyValue; + use opentelemetry_sdk::Resource; + use opentelemetry_sdk::metrics::SdkMeterProvider; + + let registry = Arc::new(prometheus::Registry::new()); + + let exporter = opentelemetry_prometheus::exporter() + .with_registry((*registry).clone()) + .build() + .map_err(|e| ObservabilityError::PrometheusInit(e.to_string()))?; + + let mut resource_attrs = vec![ + KeyValue::new("service.name", cfg.service_name.clone()), + KeyValue::new("service.version", cfg.service_version.clone()), + KeyValue::new("service.instance.id", cfg.instance_id.clone()), + ]; + if let Some(env) = &cfg.deployment_environment { + resource_attrs.push(KeyValue::new("deployment.environment", env.clone())); + } + + let mut builder = SdkMeterProvider::builder() + .with_reader(exporter) + .with_resource(Resource::new(resource_attrs)); + + if let Some(endpoint) = &cfg.otlp_endpoint { + builder = with_otlp_reader(builder, endpoint, cfg.collect_interval)?; + } + + let provider = builder.build(); + opentelemetry::global::set_meter_provider(provider.clone()); + let meter = provider.meter("superposition"); + + Ok(Self { provider, registry, meter }) + } +} + +#[cfg(not(test))] +fn with_otlp_reader( + builder: opentelemetry_sdk::metrics::MeterProviderBuilder, + endpoint: &str, + interval: std::time::Duration, +) -> Result<opentelemetry_sdk::metrics::MeterProviderBuilder, ObservabilityError> { + use opentelemetry_otlp::{MetricExporter, WithExportConfig}; + use opentelemetry_sdk::metrics::PeriodicReader; + use opentelemetry_sdk::runtime; + + let exporter = MetricExporter::builder() + .with_http() + .with_endpoint(endpoint.to_owned()) + .build() + .map_err(|e| ObservabilityError::OtlpInit(e.to_string()))?; + + let reader = PeriodicReader::builder(exporter, runtime::Tokio) + .with_interval(interval) + .build(); + + Ok(builder.with_reader(reader)) +} + +#[cfg(test)] +fn with_otlp_reader( + builder: 
opentelemetry_sdk::metrics::MeterProviderBuilder, + _endpoint: &str, + _interval: std::time::Duration, +) -> Result<opentelemetry_sdk::metrics::MeterProviderBuilder, ObservabilityError> { + // OTLP exporter requires a tokio runtime; we don't spin one up in unit tests. + Ok(builder) +} +``` + +- [ ] **Step 3: Add a smoke test** + +Append to `crates/service_utils/src/observability.rs`: + +```rust +#[cfg(test)] +mod tests { + use super::*; + + fn test_cfg() -> ObservabilityConfig { + ObservabilityConfig { + enabled: true, + bind: "127.0.0.1".parse().unwrap(), + port: 0, + label: LabelConfig::default(), + collect_interval: std::time::Duration::from_secs(10), + instance_id: "test".into(), + service_name: "sp-test".into(), + service_version: "0.0.0-test".into(), + deployment_environment: None, + otlp_endpoint: None, + } + } + + #[test] + fn init_builds_meter_and_registry() { + let obs = Observability::init(test_cfg()).expect("init failed"); + let _meter = obs.meter(); + let registry = obs.registry(); + assert_eq!(registry.gather().len(), 0, "no metrics emitted yet"); + } + + #[test] + fn meter_can_record_a_histogram_and_register_it_in_registry() { + let obs = Observability::init(test_cfg()).unwrap(); + let meter = obs.meter(); + let h = meter.f64_histogram("test.duration").with_unit("s").build(); + h.record(0.123, &[]); + + let mut buf = Vec::new(); + let encoder = prometheus::TextEncoder::new(); + let metric_families = obs.registry().gather(); + prometheus::Encoder::encode(&encoder, &metric_families, &mut buf).unwrap(); + let text = String::from_utf8(buf).unwrap(); + assert!( + text.contains("test_duration"), + "expected test_duration in exposition, got:\n{text}" + ); + } +} +``` + +- [ ] **Step 4: Run the tests** + +Run: `cargo test -p service_utils observability::tests` +Expected: 2 tests pass. + +If you get a compile error about `with_boundaries` not existing, the OpenTelemetry SDK version you pinned uses the older `with_explicit_buckets` name. Adjust the call in `meters.rs` accordingly. 
Same for `MetricExporter::builder().with_http()` — older versions used `new_exporter().http()`. + +- [ ] **Step 5: Commit** + +```bash +git add crates/service_utils/src/observability.rs crates/service_utils/src/observability/meters.rs +git commit -m "feat(observability): MeterProvider with prometheus exporter + +Builds an SdkMeterProvider wired to an opentelemetry-prometheus +exporter that writes into a per-process prometheus::Registry. OTLP +push exporter is plumbed but only activates when +OTEL_EXPORTER_OTLP_ENDPOINT is set. +" +``` + +--- + +## Task 8: Route template extraction helper + +**Files:** +- Modify: `crates/service_utils/src/observability/middleware.rs` + +- [ ] **Step 1: Add helper + tests** + +Append to `crates/service_utils/src/observability/middleware.rs`: + +```rust +use actix_web::dev::ServiceRequest; + +/// Sentinel for paths that did not match any registered route (would 404). +pub(crate) const ROUTE_NOT_FOUND: &str = "__not_found__"; + +/// Extracts the templated route pattern from a ServiceRequest. Falls back to +/// a sentinel when no route matched, to keep `http.route` cardinality bounded. +pub(crate) fn extract_route(req: &ServiceRequest) -> String { + req.match_pattern().unwrap_or_else(|| ROUTE_NOT_FOUND.to_owned()) +} +``` + +Add tests inside the existing `#[cfg(test)] mod tests` block: + +```rust + use actix_web::{App, HttpResponse, http::StatusCode, test, web}; + + #[actix_web::test] + async fn matched_route_returns_pattern() { + let app = test::init_service( + App::new().route( + "/contexts/{id}", + web::get().to(|| async { HttpResponse::Ok() }), + ), + ) + .await; + let req = test::TestRequest::get().uri("/contexts/abc123").to_request(); + let resp = test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::OK); + // Note: extract_route is exercised in the integration test in Task 15 + // because match_pattern() is only populated mid-pipeline. This unit-test + // stub is kept for build-coverage of the call site. 
+ } +``` + +- [ ] **Step 2: Run the tests** + +Run: `cargo test -p service_utils observability::middleware` +Expected: the previous 2 tests plus the 1 new test all pass. + +- [ ] **Step 3: Commit** + +```bash +git add crates/service_utils/src/observability/middleware.rs +git commit -m "feat(observability): route template extraction helper" +``` + +--- + +## Task 9: Label extraction with org/workspace from extensions + +**Files:** +- Modify: `crates/service_utils/src/observability/middleware.rs` + +- [ ] **Step 1: Confirm extension types** + +Run (from the repo root): `grep -rn "OrgId\|WorkspaceName\|insert::<.*Workspace" crates/service_utils/src/middlewares/ 2>/dev/null | head -10` +Expected: shows the actual type names that `OrgWorkspaceMiddlewareFactory` inserts into request extensions. The likely candidates are `OrgId(String)` and `WorkspaceName(String)` or similar newtypes from `superposition_types`. + +If the grep shows different type names, use those in the code below in place of the placeholders. + +- [ ] **Step 2: Add label-build helper + tests** + +Append to `crates/service_utils/src/observability/middleware.rs`: + +```rust +use opentelemetry::KeyValue; +use crate::observability::config::LabelConfig; + +/// Build the OTel attributes set for a single HTTP request. The caller passes +/// the org_id / workspace_id it read from request extensions (populated by +/// `OrgWorkspaceMiddlewareFactory`); a `None` value omits that attribute entirely (rather +/// than emitting an empty string, which would create a distinct series). 
+pub(crate) fn build_attributes( + method: &'static str, + route: &str, + status_code: u16, + org_id: Option<&str>, + workspace: Option<&str>, + label_cfg: &LabelConfig, +) -> Vec<KeyValue> { + let mut attrs = Vec::with_capacity(5); + attrs.push(KeyValue::new("http.request.method", method)); + attrs.push(KeyValue::new("http.route", route.to_owned())); + attrs.push(KeyValue::new("http.response.status_code", status_code as i64)); + if label_cfg.with_org_label { + if let Some(o) = org_id { + attrs.push(KeyValue::new("sp.org_id", o.to_owned())); + } + } + if label_cfg.with_workspace_label { + if let Some(w) = workspace { + attrs.push(KeyValue::new("sp.workspace_id", w.to_owned())); + } + } + attrs +} +``` + +Add to the test block: + +```rust + #[test] + fn build_attributes_with_all_labels() { + let cfg = LabelConfig { with_org_label: true, with_workspace_label: true }; + let attrs = build_attributes("GET", "/contexts/{id}", 200, Some("org1"), Some("ws1"), &cfg); + assert_eq!(attrs.len(), 5); + assert!(attrs.iter().any(|kv| kv.key.as_str() == "sp.org_id")); + assert!(attrs.iter().any(|kv| kv.key.as_str() == "sp.workspace_id")); + } + + #[test] + fn build_attributes_omits_missing_workspace() { + let cfg = LabelConfig { with_org_label: true, with_workspace_label: true }; + let attrs = build_attributes("POST", "/orgs", 201, Some("org1"), None, &cfg); + assert_eq!(attrs.len(), 4); + assert!(attrs.iter().any(|kv| kv.key.as_str() == "sp.org_id")); + assert!(!attrs.iter().any(|kv| kv.key.as_str() == "sp.workspace_id")); + } + + #[test] + fn build_attributes_respects_disable_flag() { + let cfg = LabelConfig { with_org_label: false, with_workspace_label: false }; + let attrs = build_attributes("GET", "/x", 200, Some("org1"), Some("ws1"), &cfg); + assert_eq!(attrs.len(), 3); + assert!(!attrs.iter().any(|kv| kv.key.as_str() == "sp.org_id")); + assert!(!attrs.iter().any(|kv| kv.key.as_str() == "sp.workspace_id")); + } +``` + +- [ ] **Step 3: Run the tests** + +Run: `cargo test -p 
service_utils observability::middleware` +Expected: all middleware tests pass. + +- [ ] **Step 4: Commit** + +```bash +git add crates/service_utils/src/observability/middleware.rs +git commit -m "feat(observability): build OTel attributes for HTTP metrics" +``` + +--- + +## Task 10: `InFlightGuard` (panic-safe active-requests decrement) + +**Files:** +- Modify: `crates/service_utils/src/observability/middleware.rs` + +- [ ] **Step 1: Add the guard + tests** + +Append to `middleware.rs`: + +```rust +use std::sync::atomic::{AtomicBool, Ordering}; +use opentelemetry::metrics::UpDownCounter; + +/// RAII guard that decrements `http.server.active_requests` exactly once: +/// when `release()` is called or, failing that, on Drop. Ensures a panicking +/// handler still decrements the counter. +pub(crate) struct InFlightGuard { + counter: UpDownCounter<i64>, + method: &'static str, + decremented: AtomicBool, +} + +impl InFlightGuard { + pub(crate) fn enter(counter: UpDownCounter<i64>, method: &'static str) -> Self { + counter.add(1, &[KeyValue::new("http.request.method", method)]); + Self { + counter, + method, + decremented: AtomicBool::new(false), + } + } + + pub(crate) fn release(&self) { + if !self.decremented.swap(true, Ordering::Relaxed) { + self.counter.add( + -1, + &[KeyValue::new("http.request.method", self.method)], + ); + } + } +} + +impl Drop for InFlightGuard { + fn drop(&mut self) { + self.release(); + } +} +``` + +Add a test (this requires a real meter; we get one from `Observability::init`): + +```rust + #[test] + fn guard_decrements_on_drop_only_once() { + use crate::observability::{Observability, ObservabilityConfig, LabelConfig}; + use std::time::Duration; + + let cfg = ObservabilityConfig { + enabled: true, + bind: "127.0.0.1".parse().unwrap(), + port: 0, + label: LabelConfig::default(), + collect_interval: Duration::from_secs(10), + instance_id: "test".into(), + service_name: "sp-test".into(), + service_version: "0".into(), + deployment_environment: None, + otlp_endpoint: None, + }; + let obs = 
Observability::init(cfg).unwrap(); + let m = obs.meter().i64_up_down_counter("test.in_flight").build(); + + { + let g = InFlightGuard::enter(m.clone(), "GET"); + g.release(); + // Drop after explicit release; should be a no-op. + } + // Hard to introspect the counter value from outside, but we can call + // release multiple times and ensure no panic. + let g = InFlightGuard::enter(m.clone(), "POST"); + g.release(); + g.release(); + drop(g); + } +``` + +- [ ] **Step 2: Run the tests** + +Run: `cargo test -p service_utils observability::middleware` +Expected: all middleware tests pass. + +- [ ] **Step 3: Commit** + +```bash +git add crates/service_utils/src/observability/middleware.rs +git commit -m "feat(observability): RAII guard for active_requests gauge + +Drop-based decrement ensures the gauge stays balanced even when a +handler panics or the future is cancelled. +" +``` + +--- + +## Task 11: Full `MetricsMiddleware` (Transform + Service) + +**Files:** +- Modify: `crates/service_utils/src/observability/middleware.rs` + +- [ ] **Step 1: Implement the full middleware** + +Replace the placeholder `pub struct MetricsMiddleware;` with the full implementation. 
Append/replace at the bottom of `middleware.rs` (keep all helpers and tests above intact): + +```rust +use std::future::{Ready, ready}; +use std::pin::Pin; +use std::rc::Rc; +use std::task::{Context, Poll}; +use std::time::Instant; + +use actix_web::{ + Error, HttpMessage, + body::MessageBody, + dev::{Service, ServiceResponse, Transform, forward_ready}, +}; +use futures_util::future::LocalBoxFuture; +use opentelemetry::metrics::Meter; + +use crate::observability::config::LabelConfig; +use crate::observability::meters::HttpMeters; + +#[derive(Clone)] +pub struct MetricsMiddleware { + meters: HttpMeters, + label_cfg: LabelConfig, +} + +impl MetricsMiddleware { + pub fn new(meter: &Meter, label_cfg: LabelConfig) -> Self { + Self { meters: HttpMeters::new(meter), label_cfg } + } +} + +impl<S, B> Transform<S, ServiceRequest> for MetricsMiddleware +where + S: Service<ServiceRequest, Response = ServiceResponse<B>, Error = Error> + 'static, + S::Future: 'static, + B: MessageBody + 'static, +{ + type Response = ServiceResponse<B>; + type Error = Error; + type InitError = (); + type Transform = MetricsMiddlewareImpl<S>; + type Future = Ready<Result<Self::Transform, Self::InitError>>; + + fn new_transform(&self, service: S) -> Self::Future { + ready(Ok(MetricsMiddlewareImpl { + service: Rc::new(service), + meters: self.meters.clone(), + label_cfg: self.label_cfg, + })) + } +} + +pub struct MetricsMiddlewareImpl<S> { + service: Rc<S>, + meters: HttpMeters, + label_cfg: LabelConfig, +} + +impl<S, B> Service<ServiceRequest> for MetricsMiddlewareImpl<S> +where + S: Service<ServiceRequest, Response = ServiceResponse<B>, Error = Error> + 'static, + S::Future: 'static, + B: MessageBody + 'static, +{ + type Response = ServiceResponse<B>; + type Error = Error; + type Future = LocalBoxFuture<'static, Result<Self::Response, Self::Error>>; + + forward_ready!(service); + + fn call(&self, req: ServiceRequest) -> Self::Future { + let service = self.service.clone(); + let meters = self.meters.clone(); + let label_cfg = self.label_cfg; + + let method_normalized = normalize_method(req.method()); + let start = Instant::now(); + let guard = InFlightGuard::enter(meters.active_requests.clone(), method_normalized); + +
Box::pin(async move { + let result = service.call(req).await; + let elapsed = start.elapsed().as_secs_f64(); + + match &result { + Ok(res) => { + let route = extract_route_from_response(res); + let status = res.status().as_u16(); + let org = res + .request() + .extensions() + .get::<OrgIdExt>() + .map(|o| o.0.clone()); + let ws = res + .request() + .extensions() + .get::<WorkspaceNameExt>() + .map(|w| w.0.clone()); + + let attrs = build_attributes( + method_normalized, + &route, + status, + org.as_deref(), + ws.as_deref(), + &label_cfg, + ); + meters.request_duration.record(elapsed, &attrs); + meters.busy_duration.add( + elapsed, + &[KeyValue::new("http.request.method", method_normalized)], + ); + } + Err(_) => { + // The error converts to a response upstream; record under 500 + // with `error.type=unhandled`. Route is unknown here. + let attrs = build_attributes( + method_normalized, + ROUTE_NOT_FOUND, + 500, + None, + None, + &label_cfg, + ); + meters.request_duration.record(elapsed, &attrs); + // Failed requests still consumed handler time; keep busy_duration + // consistent with the success path. + meters.busy_duration.add( + elapsed, + &[KeyValue::new("http.request.method", method_normalized)], + ); + } + } + + guard.release(); + result + }) + } +} + +fn extract_route_from_response<B>(res: &ServiceResponse<B>) -> String { + res.request() + .match_pattern() + .unwrap_or_else(|| ROUTE_NOT_FOUND.to_owned()) +} + +/// Newtype wrappers used to read org/workspace from request extensions. +/// Replace these with the real types inserted by `OrgWorkspaceMiddlewareFactory` +/// (verified in Task 9 Step 1) — typically something like +/// `superposition_types::OrgId(pub String)` and `WorkspaceName(pub String)`. +#[derive(Clone)] +pub(crate) struct OrgIdExt(pub String); +#[derive(Clone)] +pub(crate) struct WorkspaceNameExt(pub String); +``` + +**Important — replace `OrgIdExt` / `WorkspaceNameExt` with the real extension types** that `OrgWorkspaceMiddlewareFactory` inserts (verified at Task 9 Step 1). If they live in `superposition_types`, just import them and use them directly. The newtype shims above only exist as a fallback to keep this task buildable in isolation. 
+ +- [ ] **Step 2: Add an end-to-end test for the middleware** + +Append to the test module: + +```rust + use crate::observability::{Observability, ObservabilityConfig}; + use actix_web::App; + use std::time::Duration; + + fn obs_for_test() -> Observability { + let cfg = ObservabilityConfig { + enabled: true, + bind: "127.0.0.1".parse().unwrap(), + port: 0, + label: LabelConfig::default(), + collect_interval: Duration::from_secs(10), + instance_id: "test".into(), + service_name: "sp-test".into(), + service_version: "0".into(), + deployment_environment: None, + otlp_endpoint: None, + }; + Observability::init(cfg).unwrap() + } + + #[actix_web::test] + async fn middleware_records_request_duration() { + let obs = obs_for_test(); + let mw = MetricsMiddleware::new(&obs.meter(), LabelConfig::default()); + let app = test::init_service( + App::new().wrap(mw).route( + "/ping", + web::get().to(|| async { HttpResponse::Ok().body("pong") }), + ), + ) + .await; + + let req = test::TestRequest::get().uri("/ping").to_request(); + let resp = test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::OK); + + let mut buf = Vec::new(); + let metric_families = obs.registry().gather(); + prometheus::Encoder::encode( + &prometheus::TextEncoder::new(), + &metric_families, + &mut buf, + ) + .unwrap(); + let text = String::from_utf8(buf).unwrap(); + assert!(text.contains("http_server_request_duration_seconds_count"), "{text}"); + assert!(text.contains("http_server_busy_duration_seconds_total"), "{text}"); + assert!(text.contains("http_server_active_requests"), "{text}"); + assert!(text.contains("http_route=\"/ping\""), "{text}"); + } +``` + +- [ ] **Step 3: Run the tests** + +Run: `cargo test -p service_utils observability::middleware` +Expected: all middleware tests pass. 
+ +- [ ] **Step 4: Commit** + +```bash +git add crates/service_utils/src/observability/middleware.rs +git commit -m "feat(observability): MetricsMiddleware records HTTP signals + +Wraps every request with timing + active_requests gauge + busy_duration +counter. Uses match_pattern() to template routes, OrgWorkspaceMiddleware +extensions for tenant labels, and an InFlightGuard for panic safety. +" +``` + +--- + +## Task 12: Health endpoints + +**Files:** +- Modify: `crates/service_utils/src/observability/health.rs` + +- [ ] **Step 1: Replace stub with real handlers** + +```rust +//! Health probe endpoints mounted on the main app port. +//! +//! Paths are added to `tenant_middleware_exclusion_list` so they bypass auth. + +use actix_web::{HttpResponse, Scope, web}; + +pub const HEALTHZ: &str = "/healthz"; +pub const LIVEZ: &str = "/livez"; +pub const READYZ: &str = "/readyz"; + +/// Returns the Actix scope to mount on the main app: +/// `App::new().service(observability::health_endpoints())`. +pub fn health_endpoints() -> Scope { + web::scope("") + .route(HEALTHZ, web::get().to(healthz)) + .route(LIVEZ, web::get().to(livez)) + .route(READYZ, web::get().to(readyz)) +} + +/// Paths to add to the auth exclusion list. +pub fn health_endpoint_paths() -> &'static [&'static str] { + &[HEALTHZ, LIVEZ, READYZ] +} + +async fn healthz() -> HttpResponse { + HttpResponse::Ok().content_type("text/plain; charset=utf-8").body("ok") +} + +async fn livez() -> HttpResponse { + HttpResponse::Ok().content_type("text/plain; charset=utf-8").body("ok") +} + +async fn readyz() -> HttpResponse { + // v1: same as livez. Future: check DB pool, Redis, dependencies. 
+ HttpResponse::Ok().content_type("text/plain; charset=utf-8").body("ok") +} + +#[cfg(test)] +mod tests { + use super::*; + use actix_web::{App, http::StatusCode, test}; + + #[actix_web::test] + async fn each_endpoint_returns_200_ok() { + let app = test::init_service(App::new().service(health_endpoints())).await; + for path in health_endpoint_paths() { + let req = test::TestRequest::get().uri(path).to_request(); + let resp = test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::OK, "GET {path}"); + } + } + + #[test] + fn paths_list_matches_routes() { + let paths = health_endpoint_paths(); + assert_eq!(paths, &[HEALTHZ, LIVEZ, READYZ]); + } +} +``` + +- [ ] **Step 2: Run the tests** + +Run: `cargo test -p service_utils observability::health` +Expected: 2 tests pass. + +- [ ] **Step 3: Commit** + +```bash +git add crates/service_utils/src/observability/health.rs +git commit -m "feat(observability): /healthz /livez /readyz handlers" +``` + +--- + +## Task 13: Metrics server (separate `HttpServer` on `SUPERPOSITION_METRICS_PORT`) + +**Files:** +- Modify: `crates/service_utils/src/observability/metrics_server.rs` + +- [ ] **Step 1: Replace stub with real implementation** + +```rust +//! Separate HttpServer that exposes /metrics on SUPERPOSITION_METRICS_PORT. + +use std::{net::SocketAddr, sync::Arc}; + +use actix_web::{App, HttpResponse, HttpServer, dev::Server, web}; +use prometheus::{Encoder, Registry, TextEncoder}; + +/// Spawn an HttpServer on `bind` whose only route is `GET /metrics`. Returns +/// the actix `Server` handle so the caller can `await` it concurrently with +/// the main app. +pub fn spawn_metrics_server( + registry: Arc<Registry>, + bind: SocketAddr, +) -> std::io::Result<Server> { + let registry_data = web::Data::new(registry); + Ok(HttpServer::new(move || { + App::new() + .app_data(registry_data.clone()) + .route("/metrics", web::get().to(scrape)) + }) + .workers(1) + .bind(bind)? 
+ .run()) +} + +async fn scrape(registry: web::Data<Arc<Registry>>) -> HttpResponse { + let encoder = TextEncoder::new(); + let metric_families = registry.gather(); + let mut buf = Vec::new(); + if let Err(e) = encoder.encode(&metric_families, &mut buf) { + return HttpResponse::InternalServerError() + .body(format!("encode error: {e}")); + } + HttpResponse::Ok() + .content_type(encoder.format_type()) + .body(buf) +} + +#[cfg(test)] +mod tests { + use super::*; + use actix_web::{App, http::StatusCode, test}; + + #[actix_web::test] + async fn scrape_endpoint_returns_text_plain() { + let registry = Arc::new(Registry::new()); + let app = test::init_service( + App::new() + .app_data(web::Data::new(registry.clone())) + .route("/metrics", web::get().to(scrape)), + ) + .await; + let req = test::TestRequest::get().uri("/metrics").to_request(); + let resp = test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::OK); + let ct = resp + .headers() + .get("content-type") + .unwrap() + .to_str() + .unwrap(); + assert!(ct.starts_with("text/plain"), "got {ct}"); + } +} +``` + +- [ ] **Step 2: Run the tests** + +Run: `cargo test -p service_utils observability::metrics_server` +Expected: 1 test passes. + +- [ ] **Step 3: Commit** + +```bash +git add crates/service_utils/src/observability/metrics_server.rs +git commit -m "feat(observability): /metrics server on dedicated port" +``` + +--- + +## Task 14: DB pool saturation (r2d2 ObservableGauge callbacks) + +**Files:** +- Create: `crates/service_utils/src/observability/saturation/db_pool.rs` +- Modify: `crates/service_utils/src/observability/saturation.rs` + +- [ ] **Step 1: Confirm pool type** + +Run (from the repository root): `grep -rn "type DbPool\|r2d2::Pool<\|PgPool" crates/service_utils/src/db/ crates/service_utils/src/service/ 2>/dev/null | head -10` +Expected: shows the concrete `r2d2::Pool<…>` alias used across the codebase. 
The `DbPoolHandle` type alias below should be set to that exact type (commonly `r2d2::Pool>`). + +- [ ] **Step 2: Create the db_pool module** + +```rust +//! ObservableGauge callbacks for the r2d2 connection pool. Purely passive — +//! no instrumentation at `pool.get()` call sites. + +use opentelemetry::{KeyValue, metrics::Meter}; + +/// Concrete pool type used across the codebase. Update if it differs. +pub type DbPoolHandle = std::sync::Arc< + r2d2::Pool>, +>; + +pub fn register(meter: &Meter, pool: DbPoolHandle, pool_name: &'static str) { + let pool_for_usage = pool.clone(); + let usage_pool_name = KeyValue::new("pool.name", pool_name); + meter + .u64_observable_gauge("db.client.connections.usage") + .with_description("Number of DB connections in idle/used state.") + .with_callback(move |observer| { + let s = pool_for_usage.state(); + let used = s.connections.saturating_sub(s.idle_connections); + observer.observe( + s.idle_connections as u64, + &[ + KeyValue::new("state", "idle"), + usage_pool_name.clone(), + ], + ); + observer.observe( + used as u64, + &[ + KeyValue::new("state", "used"), + usage_pool_name.clone(), + ], + ); + }) + .build(); + + let pool_for_max = pool.clone(); + let max_pool_name = KeyValue::new("pool.name", pool_name); + meter + .u64_observable_gauge("db.client.connections.max") + .with_description("Configured maximum size of the DB connection pool.") + .with_callback(move |observer| { + observer.observe(pool_for_max.max_size() as u64, &[max_pool_name.clone()]); + }) + .build(); +} +``` + +- [ ] **Step 3: Wire it via the saturation entry point** + +Replace `crates/service_utils/src/observability/saturation.rs`: + +```rust +//! Saturation collectors: DB pool, Redis pool, Tokio runtime. +//! +//! Most metrics are observable-gauge callbacks (no background tasks). +//! Only `tokio_runtime` requires a polling loop. 
+ +mod db_pool; +mod redis_pool; +mod tokio_runtime; + +use opentelemetry::metrics::Meter; + +pub use db_pool::DbPoolHandle; + +/// Optional dependencies the saturation subsystem can observe. +#[derive(Default, Clone)] +pub struct SaturationDeps { + pub db_pool: Option<DbPoolHandle>, + pub redis_client: Option<redis_pool::RedisHandle>, + pub tokio_collect_interval: std::time::Duration, +} + +pub fn register_observers( + meter: &Meter, + deps: SaturationDeps, +) -> Result<(), super::ObservabilityError> { + if let Some(pool) = deps.db_pool { + db_pool::register(meter, pool, "primary"); + } + if let Some(client) = deps.redis_client { + redis_pool::register(meter, client, "primary"); + } + + #[cfg(tokio_unstable)] + if deps.tokio_collect_interval > std::time::Duration::ZERO { + tokio_runtime::spawn(meter, deps.tokio_collect_interval); + } + + Ok(()) +} +``` + +- [ ] **Step 4: Verify it compiles** + +Run: `cargo check -p service_utils` +Expected: exit code 0. Failures here are usually: +- The `DbPoolHandle` alias doesn't match the codebase's actual pool type → adjust to whatever Step 1 found. +- `redis_pool` module doesn't exist yet → that's Task 15. Create an empty stub now: `crates/service_utils/src/observability/saturation/redis_pool.rs` with: + ```rust + //! Stub — implemented in Task 15. + use opentelemetry::metrics::Meter; + pub type RedisHandle = std::sync::Arc<()>; + pub fn register(_meter: &Meter, _client: RedisHandle, _pool_name: &'static str) {} + ``` +- Same for `tokio_runtime` (Task 16). Stub: `crates/service_utils/src/observability/saturation/tokio_runtime.rs` with: + ```rust + //! Stub — implemented in Task 16. + #[cfg(tokio_unstable)] + pub fn spawn(_meter: &opentelemetry::metrics::Meter, _interval: std::time::Duration) {} + ``` + +- [ ] **Step 5: Add a smoke test** + +Append to `db_pool.rs`: + +```rust +#[cfg(test)] +mod tests { + // Constructing a real r2d2 pool requires a database. 
We assert the function + // signature compiles and that calling `register` does not panic with a + // synthetic in-memory pool; this is exercised via the integration test in + // Task 18 instead. +} +``` + +- [ ] **Step 6: Commit** + +```bash +git add crates/service_utils/src/observability/saturation.rs crates/service_utils/src/observability/saturation/ +git commit -m "feat(observability): db pool saturation gauges + +ObservableGauge callbacks read r2d2::Pool::state() at scrape time. +Emits db.client.connections.usage{state} and db.client.connections.max. +" +``` + +--- + +## Task 15: Redis pool saturation (fred metrics) + +**Files:** +- Modify: `crates/service_utils/src/observability/saturation/redis_pool.rs` + +- [ ] **Step 1: Find fred client type and metrics surface** + +Run: `grep -rn "fred::\|RedisClient\|Pool" /Users/natarajankannan/src/superposition/crates/service_utils/src/redis* 2>/dev/null | head -10` +Expected: shows the concrete fred client type (likely `fred::clients::RedisClient` or `fred::clients::RedisPool`). + +Also run: `cargo doc -p fred --no-deps --open` *(or browse https://docs.rs/fred/latest/fred/)* and locate the metrics API. fred 9.x exposes per-client `read_latency_metrics()` / `write_latency_metrics()` and connection counters via `Server` / `Stats` types. Pin the names you actually find. + +- [ ] **Step 2: Replace stub** + +```rust +//! Saturation gauges for the Redis client pool (fred crate). +//! +//! fred's `metrics` feature exposes per-client / per-pool stats. The +//! callbacks below are intentionally tolerant: if a stat is unavailable +//! in the version we use, the metric is simply not emitted (a TODO is +//! left at the call site). + +use std::sync::Arc; + +use opentelemetry::{KeyValue, metrics::Meter}; + +/// Wraps whatever fred client/pool type the rest of `service_utils` uses. +/// Update the inner type to match `crate::redis`'s public surface. 
+pub type RedisHandle = Arc<dyn RedisStats + Send + Sync>; + +/// Tiny abstraction so the metrics module doesn't have to know fred's +/// concrete types. Implement on the wrapper that `crate::redis` already +/// hands around. +pub trait RedisStats { + fn idle_connections(&self) -> Option<u64>; + fn used_connections(&self) -> Option<u64>; + fn commands_in_flight(&self) -> Option<u64>; +} + +pub fn register(meter: &Meter, client: RedisHandle, pool_name: &'static str) { + let usage_label = KeyValue::new("pool.name", pool_name); + + let c = client.clone(); + let label = usage_label.clone(); + meter + .u64_observable_gauge("redis.client.connections.usage") + .with_description("Number of Redis connections in idle/used state.") + .with_callback(move |observer| { + if let Some(idle) = c.idle_connections() { + observer.observe( + idle, + &[KeyValue::new("state", "idle"), label.clone()], + ); + } + if let Some(used) = c.used_connections() { + observer.observe( + used, + &[KeyValue::new("state", "used"), label.clone()], + ); + } + }) + .build(); + + let c = client.clone(); + let label = usage_label.clone(); + meter + .u64_observable_gauge("redis.client.commands.in_flight") + .with_description("Number of Redis commands currently in flight.") + .with_callback(move |observer| { + if let Some(n) = c.commands_in_flight() { + observer.observe(n, &[label.clone()]); + } + }) + .build(); +} +``` + +- [ ] **Step 3: Wire `RedisStats` to whatever fred client you use** + +Find the wrapper in `crate::redis` (the internal Redis surface), and `impl RedisStats for YourWrapper` using the fred metrics surface confirmed in Step 1. If a particular field is unavailable in your fred version, leave the impl returning `None` and a `// TODO(observability): expose when fred …` comment. + +This step is intentionally light on prescriptive code because the exact fred API depends on the pinned version. The contract is just three `Option<u64>` getters; all three returning `None` is acceptable for v1 — the metrics simply won't have data. 
+ +- [ ] **Step 4: Verify it compiles** + +Run: `cargo check -p service_utils` +Expected: exit code 0. + +- [ ] **Step 5: Commit** + +```bash +git add crates/service_utils/src/observability/saturation/redis_pool.rs crates/service_utils/src/redis* +git commit -m "feat(observability): redis pool saturation gauges + +ObservableGauge callbacks read fred client/pool stats via a thin +RedisStats trait. Tolerant to missing fields — a None return simply +omits the metric. +" +``` + +--- + +## Task 16: Tokio runtime saturation (`cfg(tokio_unstable)`) + +**Files:** +- Modify: `crates/service_utils/src/observability/saturation/tokio_runtime.rs` + +- [ ] **Step 1: Replace stub** + +```rust +//! Tokio runtime saturation, gated on `cfg(tokio_unstable)`. +//! +//! Unlike DB/Redis, `tokio_metrics::RuntimeMonitor` is delta-based: each +//! `.intervals()` call returns stats since the last call. So we run a +//! background task that samples every `interval` and stores derived values +//! in atomics that observable-gauge callbacks read. + +#[cfg(not(tokio_unstable))] +pub fn spawn(_meter: &opentelemetry::metrics::Meter, _interval: std::time::Duration) {} + +#[cfg(tokio_unstable)] +mod inner { + use std::sync::Arc; + use std::sync::atomic::{AtomicU64, Ordering}; + use std::time::Duration; + + use opentelemetry::metrics::Meter; + use tokio_metrics::RuntimeMonitor; + + #[derive(Default)] + struct Snapshot { + workers: AtomicU64, + global_queue_depth: AtomicU64, + busy_ratio_milli: AtomicU64, // busy_ratio * 1000, stored as integer + } + + pub fn spawn(meter: &Meter, interval: Duration) { + let handle = match tokio::runtime::Handle::try_current() { + Ok(h) => h, + Err(_) => return, // not running on a tokio runtime; no-op + }; + let snap = Arc::new(Snapshot::default()); + + // Background sampler. 
+ let snap_for_task = snap.clone(); + tokio::spawn(async move { + let monitor = RuntimeMonitor::new(&handle); + let mut intervals = monitor.intervals(); + loop { + if let Some(m) = intervals.next() { + snap_for_task + .workers + .store(m.workers_count as u64, Ordering::Relaxed); + snap_for_task + .global_queue_depth + .store(m.global_queue_depth as u64, Ordering::Relaxed); + let busy = m.total_busy_duration.as_secs_f64(); + // Available worker time in this sample = wall-clock window × workers. + let total = interval.as_secs_f64() + * (m.workers_count as f64).max(1.0); + let ratio = (busy / total).clamp(0.0, 1.0); + snap_for_task + .busy_ratio_milli + .store((ratio * 1000.0) as u64, Ordering::Relaxed); + } + tokio::time::sleep(interval).await; + } + }); + + // ObservableGauges read from the snapshot atomics. + let s = snap.clone(); + meter + .u64_observable_gauge("runtime.tokio.workers") + .with_callback(move |observer| { + observer.observe(s.workers.load(Ordering::Relaxed), &[]); + }) + .build(); + + let s = snap.clone(); + meter + .u64_observable_gauge("runtime.tokio.global_queue.depth") + .with_callback(move |observer| { + observer.observe(s.global_queue_depth.load(Ordering::Relaxed), &[]); + }) + .build(); + + let s = snap.clone(); + meter + .f64_observable_gauge("runtime.tokio.workers.busy_ratio") + .with_callback(move |observer| { + let milli = s.busy_ratio_milli.load(Ordering::Relaxed); + observer.observe(milli as f64 / 1000.0, &[]); + }) + .build(); + } +} + +#[cfg(tokio_unstable)] +pub use inner::spawn; +``` + +- [ ] **Step 2: Verify it compiles both with and without the cfg** + +Run: `cargo check -p service_utils` +Expected: exit code 0 (the workspace `.cargo/config.toml` enables `tokio_unstable`, so the `inner` branch compiles). + +Run: `RUSTFLAGS="" cargo check -p service_utils` +Expected: exit code 0 (the `not(tokio_unstable)` no-op stub compiles when the flag is off). 
+ +If `tokio_metrics::RuntimeMonitor::intervals()` field names differ from those used above (`workers_count`, `global_queue_depth`, `total_busy_duration`), adjust to match. The fundamental shape — `.intervals()` returning a delta iterator — is stable across recent versions. + +- [ ] **Step 3: Commit** + +```bash +git add crates/service_utils/src/observability/saturation/tokio_runtime.rs +git commit -m "feat(observability): tokio runtime saturation gauges + +Background sampler updates atomic snapshots; observable gauges read +from the snapshot. Gated on cfg(tokio_unstable); compiles to a no-op +when the flag is disabled. +" +``` + +--- + +## Task 17: Add health paths to `tenant_middleware_exclusion_list` + +**Files:** +- Modify: `crates/superposition/src/app_state.rs` + +- [ ] **Step 1: Make health paths always-excluded** + +Replace lines 101–107 of `crates/superposition/src/app_state.rs` (the `tenant_middleware_exclusion_list` field assignment). Use `Read` first to confirm the surrounding context, then `Edit`: + +```rust + tenant_middleware_exclusion_list: { + let mut set = get_from_env_unsafe::<String>( + "TENANT_MIDDLEWARE_EXCLUSION_LIST", + ) + .expect("TENANT_MIDDLEWARE_EXCLUSION_LIST is not set") + .split(',') + .map(String::from) + .collect::<HashSet<String>>(); + // Always exclude observability health endpoints from auth checks. + set.extend( + service_utils::observability::health_endpoint_paths() + .iter() + .map(|s| s.to_string()), + ); + set + }, +``` + +- [ ] **Step 2: Verify it compiles** + +Run: `cargo check -p superposition` +Expected: exit code 0. + +- [ ] **Step 3: Commit** + +```bash +git add crates/superposition/src/app_state.rs +git commit -m "feat: exclude /healthz /livez /readyz from auth checks + +Adds the observability health paths to tenant_middleware_exclusion_list +so probes do not trigger auth flow. Operators no longer need to remember +to put these in TENANT_MIDDLEWARE_EXCLUSION_LIST. 
+" +``` + +--- + +## Task 18: Wire observability into `main.rs` + +**Files:** +- Modify: `crates/superposition/src/main.rs` + +- [ ] **Step 1: Add imports + early init** + +At the top of `main()` in `crates/superposition/src/main.rs` (after tracing init, before app_state construction), add: + +```rust + use service_utils::observability::{ + self, MetricsMiddleware, Observability, ObservabilityConfig, SaturationDeps, + }; + + let obs_cfg = ObservabilityConfig::from_env() + .expect("invalid observability env config"); + let obs_enabled = obs_cfg.enabled; + let observability = if obs_enabled { + Some(Observability::init(obs_cfg.clone()).expect("observability init failed")) + } else { + None + }; + let metrics_meter = observability.as_ref().map(|o| o.meter()); + let metrics_label_cfg = obs_cfg.label; +``` + +- [ ] **Step 2: Register saturation observers** + +After `app_state` is built (so the DB pool is available) and inside the tokio runtime context, add: + +```rust + if let (Some(obs), Some(pool)) = (observability.as_ref(), Some(app_state.db_pool.clone())) { + observability::register_observers( + &obs.meter(), + SaturationDeps { + db_pool: Some(pool), + redis_client: app_state.redis_client.clone().map(Into::into), + tokio_collect_interval: obs_cfg.collect_interval, + }, + ) + .expect("saturation observer registration failed"); + } +``` + +(Adjust `app_state.db_pool` and `app_state.redis_client` to whatever fields actually exist on `AppState`. If the redis client isn't easily Arc-wrapped, pass `None` and leave the metric unattached for now.) + +- [ ] **Step 3: Spawn the metrics server** + +After the saturation registration, add: + +```rust + let metrics_server_handle = if let Some(obs) = observability.as_ref() { + let bind: std::net::SocketAddr = format!("{}:{}", obs_cfg.bind, obs_cfg.port) + .parse() + .expect("invalid metrics bind addr"); + Some(observability::spawn_metrics_server(obs.registry(), bind)?) 
+ } else { + None + }; +``` + +- [ ] **Step 4: Add the middleware to the App builder** + +In the `HttpServer::new(move || App::new()…)` closure, add the middleware as the *outermost* `.wrap()` (i.e., the *last* `.wrap()` in the chain — Actix runs the last-wrapped middleware first): + +```rust + .service(observability::health_endpoints()) + // ... existing .service() and .wrap() calls (auth_z, auth_n, ... + // RequestResponseLogger, TracingLogger) ... + .wrap(actix_web::middleware::Condition::new( + obs_enabled, + metrics_meter + .as_ref() + .map(|m| MetricsMiddleware::new(m, metrics_label_cfg)) + .unwrap_or_else(|| MetricsMiddleware::new( + // construct a no-op meter for the disabled case + &opentelemetry::global::meter("noop"), + metrics_label_cfg, + )), + )) + .wrap(TracingLogger::::new()) +``` + +The `Condition` wrapper makes the middleware a no-op when `SUPERPOSITION_METRICS_ENABLED=false`. Match the `&meter` borrow shape that `MetricsMiddleware::new` expects. + +- [ ] **Step 5: Run both servers concurrently** + +Replace the final `.run().await` with a `try_join!` over the main and metrics servers. Example: + +```rust + let main_server = HttpServer::new(/* ... */) + .bind(("0.0.0.0", cac_port))? + .workers(get_from_env_or_default("ACTIX_WORKER_COUNT", 5)) + .keep_alive(Duration::from_secs( + get_from_env_unsafe("ACTIX_KEEP_ALIVE").unwrap_or(120), + )) + .run(); + + match metrics_server_handle { + Some(metrics) => { + futures_util::try_join!(main_server, metrics)?; + } + None => { + main_server.await?; + } + } +``` + +(`futures_util` is already a workspace dep — see root `Cargo.toml`.) + +- [ ] **Step 6: Build the binary** + +Run: `cargo build -p superposition` +Expected: exit code 0. Compilation errors here are the hardest part of the wiring; iterate on imports and types until clean. 
+ +- [ ] **Step 7: Smoke-test locally** + +Start the binary against the local docker-compose dev stack: + +```bash +make run # or whatever the makefile target is +``` + +In another shell: + +```bash +curl -s -i http://localhost:8080/healthz +curl -s http://localhost:9091/metrics | head -50 +``` + +Expected: + +- `/healthz` returns `200 OK` with body `ok`. +- `/metrics` returns Prometheus exposition that includes lines starting with `# HELP http_server_request_duration_seconds`, `http_server_active_requests`, `http_server_busy_duration_seconds_total`, and (after issuing a few API requests) `http_server_request_duration_seconds_bucket{...}` lines with `http_route` labels. + +Stop the binary. + +- [ ] **Step 8: Commit** + +```bash +git add crates/superposition/src/main.rs +git commit -m "feat: wire observability into main binary + +- Init Observability early (Prometheus exporter + optional OTLP push) +- Spawn metrics server on SUPERPOSITION_METRICS_PORT +- Register DB/Redis/Tokio saturation observers +- Wrap App with MetricsMiddleware (gated by SUPERPOSITION_METRICS_ENABLED) +- Mount /healthz /livez /readyz on the main app +- try_join! both servers so the process exits if either dies +" +``` + +--- + +## Task 19: Integration test — full pipeline through `/metrics` + +**Files:** +- Create: `crates/service_utils/tests/observability_integration.rs` + +- [ ] **Step 1: Write the test** + +```rust +//! End-to-end test: an Actix app wrapped with MetricsMiddleware serves several +//! routes; we then issue requests and parse the Prometheus scrape output to +//! assert on the metrics that should appear. 
+ +use actix_web::{App, HttpResponse, http::StatusCode, test, web}; +use prometheus::Encoder; +use service_utils::observability::{ + LabelConfig, MetricsMiddleware, Observability, ObservabilityConfig, +}; + +fn cfg() -> ObservabilityConfig { + ObservabilityConfig { + enabled: true, + bind: "127.0.0.1".parse().unwrap(), + port: 0, + label: LabelConfig::default(), + collect_interval: std::time::Duration::from_secs(10), + instance_id: "it".into(), + service_name: "sp-it".into(), + service_version: "0".into(), + deployment_environment: None, + otlp_endpoint: None, + } +} + +fn scrape(obs: &Observability) -> String { + let metric_families = obs.registry().gather(); + let mut buf = Vec::new(); + prometheus::TextEncoder::new() + .encode(&metric_families, &mut buf) + .unwrap(); + String::from_utf8(buf).unwrap() +} + +#[actix_web::test] +async fn metrics_appear_after_requests() { + let obs = Observability::init(cfg()).unwrap(); + let mw = MetricsMiddleware::new(&obs.meter(), LabelConfig::default()); + let app = test::init_service( + App::new() + .wrap(mw) + .route("/ping", web::get().to(|| async { HttpResponse::Ok() })) + .route( + "/echo/{name}", + web::post().to(|p: web::Path| async move { + HttpResponse::Created().body(p.into_inner()) + }), + ) + .route( + "/boom", + web::get().to(|| async { HttpResponse::InternalServerError() }), + ), + ) + .await; + + for _ in 0..3 { + let req = test::TestRequest::get().uri("/ping").to_request(); + let resp = test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::OK); + } + let req = test::TestRequest::post().uri("/echo/world").to_request(); + let resp = test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::CREATED); + + let req = test::TestRequest::get().uri("/boom").to_request(); + let resp = test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::INTERNAL_SERVER_ERROR); + + let req = test::TestRequest::get().uri("/no-such-route").to_request(); + let resp = 
test::call_service(&app, req).await; + assert_eq!(resp.status(), StatusCode::NOT_FOUND); + + let body = scrape(&obs); + + // Request duration histogram exists with expected labels for /ping (3 hits). + let ping_count_line = body + .lines() + .find(|l| { + l.starts_with("http_server_request_duration_seconds_count{") + && l.contains("http_route=\"/ping\"") + && l.contains("http_request_method=\"GET\"") + && l.contains("http_response_status_code=\"200\"") + }) + .unwrap_or_else(|| panic!("no /ping count line in:\n{body}")); + let ping_count: f64 = ping_count_line + .rsplit_once(' ') + .unwrap() + .1 + .trim() + .parse() + .unwrap(); + assert_eq!(ping_count as u64, 3); + + // 5xx series for /boom appears. + assert!( + body.lines().any(|l| { + l.starts_with("http_server_request_duration_seconds_count{") + && l.contains("http_route=\"/boom\"") + && l.contains("http_response_status_code=\"500\"") + }), + "no /boom 500 series in:\n{body}" + ); + + // Unmatched path uses the sentinel. + assert!( + body.lines().any(|l| { + l.starts_with("http_server_request_duration_seconds_count{") + && l.contains("http_route=\"__not_found__\"") + }), + "no __not_found__ series in:\n{body}" + ); + + // busy_duration_total > 0 + let busy = body + .lines() + .find(|l| l.starts_with("http_server_busy_duration_seconds_total{")) + .unwrap_or_else(|| panic!("no busy_duration line in:\n{body}")); + let busy_value: f64 = busy.rsplit_once(' ').unwrap().1.trim().parse().unwrap(); + assert!(busy_value > 0.0, "expected busy_duration > 0, got {busy_value}"); + + // active_requests returns to 0 after all requests complete. 
+ let active_lines: Vec<_> = body + .lines() + .filter(|l| l.starts_with("http_server_active_requests{")) + .collect(); + for line in &active_lines { + let v: f64 = line.rsplit_once(' ').unwrap().1.trim().parse().unwrap(); + assert_eq!(v, 0.0, "active_requests not zero: {line}"); + } +} +``` + +- [ ] **Step 2: Run the test** + +Run: `cargo test -p service_utils --test observability_integration` +Expected: 1 test passes. + +- [ ] **Step 3: Commit** + +```bash +git add crates/service_utils/tests/observability_integration.rs +git commit -m "test(observability): end-to-end integration + +Wraps a small App with MetricsMiddleware, issues requests of various +shapes (200, 201, 500, 404), and asserts on the parsed Prometheus +exposition: per-route counts, the 5xx series, the __not_found__ +sentinel, busy_duration > 0, and active_requests returning to 0. +" +``` + +--- + +## Task 20: Cardinality regression test + +**Files:** +- Modify: `crates/service_utils/tests/observability_integration.rs` + +- [ ] **Step 1: Add the test** + +Append to the integration test file: + +```rust +#[actix_web::test] +async fn cardinality_stays_within_budget() { + let obs = Observability::init(cfg()).unwrap(); + let mw = MetricsMiddleware::new(&obs.meter(), LabelConfig::default()); + let app = test::init_service( + App::new() + .wrap(mw) + .route("/a", web::get().to(|| async { HttpResponse::Ok() })) + .route("/b", web::get().to(|| async { HttpResponse::Ok() })) + .route("/c", web::post().to(|| async { HttpResponse::Created() })), + ) + .await; + + for _ in 0..10 { + for path in &["/a", "/b"] { + let req = test::TestRequest::get().uri(path).to_request(); + let _ = test::call_service(&app, req).await; + } + let req = test::TestRequest::post().uri("/c").to_request(); + let _ = test::call_service(&app, req).await; + } + + let body = scrape(&obs); + let series = body + .lines() + .filter(|l| !l.is_empty() && !l.starts_with('#')) + .count(); + + // Budget for this scenario: 3 routes × 1 method each × 
1 status × ~12 + // (10 buckets + sum + count) = ~36 series for the histogram, plus 3 for + // busy_duration, plus 1 for active_requests, plus a few from `target_info` + // that the prometheus exporter emits. Headroom: 200. + assert!(series <= 200, "cardinality regression: {series} series\n{body}"); +} +``` + +- [ ] **Step 2: Run the test** + +Run: `cargo test -p service_utils --test observability_integration` +Expected: 2 tests pass. + +- [ ] **Step 3: Commit** + +```bash +git add crates/service_utils/tests/observability_integration.rs +git commit -m "test(observability): cardinality regression budget + +Asserts that a 3-route × 1-method × 1-status scenario produces no more +than 200 series, catching accidental high-cardinality labels in review. +" +``` + +--- + +## Task 21: Update README + makefile note + +**Files:** +- Modify: `README.md` +- Modify: `makefile` + +- [ ] **Step 1: Add a section to README** + +Find the build/development section and append: + +````markdown +### Metrics & observability + +The HTTP API exposes Prometheus metrics on `SUPERPOSITION_METRICS_PORT` (default `9091`): + +``` +curl http://localhost:9091/metrics +``` + +Health endpoints live on the main port: `GET /healthz`, `/livez`, `/readyz`. + +For full details (labels, cardinality, OTLP push), see +[`docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md`](docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md). + +**Note on `tokio_unstable`.** The workspace's `.cargo/config.toml` enables +`--cfg tokio_unstable` so `tokio-metrics` can collect runtime saturation. This +flag only adds APIs; no behavioural change for existing code. Contributors who +build outside `cargo` (e.g., custom IDE invocations) should pass the same flag, +or accept that the `runtime.tokio.*` metrics will be absent. +```` + +- [ ] **Step 2: Sanity-check the makefile** + +Read the makefile's `build`/`run` targets. 
If they invoke `cargo` plainly, no change is needed (the `.cargo/config.toml` is picked up automatically). If they set `RUSTFLAGS=` explicitly anywhere, ensure `--cfg tokio_unstable` is preserved. + +- [ ] **Step 3: Commit** + +```bash +git add README.md makefile +git commit -m "docs: note metrics endpoints and tokio_unstable build flag" +``` + +--- + +## Task 22: Final smoke-test pass + +**Files:** none (verification only) + +- [ ] **Step 1: Full test suite** + +Run: `cargo test --workspace` +Expected: all tests pass (including the integration test from Task 19). + +- [ ] **Step 2: Build with metrics disabled** + +Run: + +```bash +SUPERPOSITION_METRICS_ENABLED=false cargo build -p superposition +``` + +Expected: builds cleanly. + +- [ ] **Step 3: Build without `tokio_unstable`** + +Run: + +```bash +RUSTFLAGS="" cargo check -p service_utils +``` + +Expected: builds cleanly (the `not(tokio_unstable)` no-op stub is exercised). + +- [ ] **Step 4: Live smoke** + +Bring up the dev stack and verify metrics flow into Prometheus / VictoriaMetrics: + +```bash +docker compose -f grafana/docker-compose.yaml up -d +make run # or whatever the makefile target is +``` + +Add a scrape target to `grafana/prometheus.yml` for `host.docker.internal:9091` (Mac) or the host IP (Linux), reload Prometheus (`docker compose restart prometheus`), and verify in the Prometheus UI's Targets page that the new target is `UP`. Query `http_server_request_duration_seconds_count` and confirm series with `http_route` labels appear after issuing a few requests. + +Stop the dev stack. + +- [ ] **Step 5: Commit (if any docs changed during smoke)** + +If the Prometheus scrape config got a new entry, commit it: + +```bash +git add grafana/prometheus.yml +git commit -m "chore(grafana): scrape superposition metrics endpoint" +``` + +Otherwise, no commit. 
+ +--- + +## Notes for self-review (already incorporated) + +- **Spec coverage.** Every section of the spec maps to a task: §5 architecture → Tasks 4, 18; §6 module structure → Tasks 4, 11, 12, 13, 14, 15, 16; §7 dependencies → Tasks 1, 3; §8 metric definitions → Tasks 7, 11, 14, 15, 16; §9 middleware mechanics → Tasks 6, 8, 9, 10, 11; §10 saturation collectors → Tasks 14, 15, 16; §11 configuration → Task 5; §12 testing strategy → Tasks 19, 20; §13 rollout — handled at deployment time, not in code (env var defaults); §14 future work — explicitly out of scope. +- **Type consistency.** `HttpMeters` (Task 7), `ObservabilityConfig`/`LabelConfig` (Task 5), `Observability` (Tasks 4, 7), and the helper functions in `middleware.rs` (Tasks 6–11) all use consistent names across tasks. +- **Build-flag duality.** Task 16 explicitly tests both with and without `tokio_unstable`. Task 22 retests this at the end as a regression check. +- **Auth bypass mechanism.** Task 17 wires the health paths into the existing `tenant_middleware_exclusion_list` machinery (verified in spec §5.2 against `crates/service_utils/src/middlewares/auth_n.rs:44–60`), not the incorrect "register before auth_n" pattern that was in an earlier draft of the spec. diff --git a/docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md b/docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md new file mode 100644 index 000000000..a511c0841 --- /dev/null +++ b/docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md @@ -0,0 +1,526 @@ +# OpenTelemetry Golden-Signals Middleware + +- **Date:** 2026-05-10 +- **Status:** Design — shipped with deviations (see §0) +- **Owner:** Natarajan Kannan +- **Target crate:** `service_utils` +- **Reference TSDB:** VictoriaMetrics (single-node `vmsingle`); design is TSDB-agnostic + +## 0. Post-implementation deviations + +The PR shipped with the following changes versus the design captured below. 
The body of this document is preserved as the original design rationale. + +- **Health endpoints (`/healthz`, `/livez`, `/readyz`) dropped.** The pre-existing `GET /health` already serves the up-check role; the k8s-conventional liveness/readiness split can be added in a follow-up PR when an actual deployment consumes it. This makes §5.2 ("Auth bypass for health endpoints"), the `health_endpoints()` API in §6, and Task 12/17 of the plan obsolete. +- **`tokio_unstable` flag, `tokio-metrics` dep, and `.cargo/config.toml` removed.** Tokio 1.50 exposes `Handle::metrics().num_workers()`, `.global_queue_depth()`, and `.worker_total_busy_duration(i)` as stable APIs (the last gated on `target_has_atomic = "64"`, like tokio itself does). `saturation::tokio_runtime` reads `Handle::metrics()` directly inside each observable callback — no background sampler, no `RuntimeMonitor`, no atomics snapshot. +- **`runtime.tokio.workers.busy_ratio` replaced with `runtime.tokio.workers.busy.time`.** Exposes cumulative busy time in seconds as a monotonic OTel Counter (summed across workers); Prometheus computes saturation via `rate(...) / num_workers` at query time. Same semantic, Prom-idiomatic. +- **`opentelemetry-semantic-conventions` dependency removed.** The handful of attribute names we use are inlined as string literals. +- **`SaturationDeps::tokio_collect_interval` field removed.** No background sampler → no interval to configure. `SUPERPOSITION_METRICS_COLLECT_INTERVAL` still controls the OTLP periodic-reader cadence. +- **`tenant_middleware_exclusion_list` reverted to env-only.** With health endpoints removed, there's no need to extend it programmatically. + +## 1. Background + +Superposition's Actix-web HTTP API has structured tracing via `tracing-actix-web` and a `RequestResponseLogger` middleware that emits a single `info!(latency = …, "GoldenSignal")` log line per request. 
There is no Prometheus/OpenTelemetry client, no `/metrics` endpoint, and no per-process gauge for in-flight work, DB pool state, or runtime saturation. The repository ships a `grafana/` directory with Prometheus + Grafana docker-compose and a Python `custom-exporter`, but no application metrics flow through it. + +This design adds first-class metrics exposition for the four [Google SRE golden signals](https://sre.google/sre-book/monitoring-distributed-systems/) — **latency, traffic, errors, saturation** — using OpenTelemetry, with VictoriaMetrics as the reference scrape target. Instrumentation is applied via Actix middleware so any existing or future API endpoint is covered automatically. + +## 2. Goals + +1. Expose Prometheus-format metrics for every HTTP route on the main API, covering latency / traffic / errors / saturation, without per-handler code changes. +2. Emit OpenTelemetry semantic-convention metric names so any OTel-native backend (VictoriaMetrics, Prometheus, Grafana Mimir, SigNoz, OpenObserve, Datadog, Honeycomb, …) can ingest them. +3. Support both **pull** (Prometheus scrape on a dedicated port) and **push** (OTLP HTTP/gRPC) exposition; users choose at deployment time via standard OTel env vars. +4. Capture saturation signals beyond HTTP — DB connection pool, Redis connection pool, Tokio runtime — so a real "is this process overloaded?" view is possible. +5. Keep the per-request overhead low (single-digit microseconds) and keep cardinality bounded by design. +6. Provide a clean configuration surface so operators can disable high-cardinality labels (notably `workspace_id`) without code changes. + +## 3. Non-goals + +- **Trace correlation via exemplars.** Linking percentile spikes to specific traces requires `tracing-opentelemetry` to be wired through the existing tracing setup. Out of scope for this spec; a follow-up. +- **OTLP traces export.** The same SDK init code is structured to host trace export later, but this spec covers metrics only. 
+- **Per-tenant separate histograms.** Considered (option D in §11.1) and deferred until the global histogram's cardinality budget proves tight in production. +- **Grafana dashboards.** A separate PR will commit JSON dashboards under `grafana/dashboards/` covering the four golden-signal panels. +- **Alert rules.** A separate PR will commit VM/Prometheus alert rule YAML. +- **Removing the existing `info!(… "GoldenSignal")` log.** Stays for now; downstream tooling may consume it. Marked for removal once dashboards have migrated. +- **Instrumenting non-HTTP work** (background jobs, DB query timing per query). Out of scope for v1. + +## 4. Decisions summary + +| Decision | Choice | Rationale | +|---|---|---| +| Client library | OpenTelemetry SDK + Prometheus exporter | TSDB-agnostic; future-proof for OSS users; future-proofs unified traces+metrics. | +| TSDB (reference) | VictoriaMetrics (`vmsingle`) | Cheap to operate; Prom-compatible; cluster path exists if needed. | +| Exposition transport | Prometheus scrape on dedicated port + optional OTLP push | Pull-by-default for self-hosted users; OTLP path unlocks every OTel-native backend. | +| Labels on HTTP metrics | `route × method × status × org × workspace` | Tenant-level slicing in metrics; workspace label is env-disable-able for users with very high workspace counts. | +| Saturation signals | HTTP active requests + HTTP busy duration + Tokio runtime + DB pool + Redis pool | Multiple independent signals avoid single-metric blind spots. Host-level (CPU/mem/FD) stays with `node-exporter`. | +| Where `/metrics` lives | Separate listener on `SUPERPOSITION_METRICS_PORT` (default `9091`) | Network-policy isolation; scrape requests don't pollute the app's own metrics; no auth interaction. | +| Where `/healthz` lives | Main app port `8080`, paths added to `auth_n`'s exception set | Probes exercise the real user-facing port. 
| +| Module location | `crates/service_utils/src/observability.rs` (+ `observability/` for submodules, no `mod.rs`) | Matches existing convention for cross-cutting concerns; modern Rust 2018+ module layout. | +| Build config | `.cargo/config.toml` adds `--cfg tokio_unstable` workspace-wide | Required by `tokio-metrics` runtime instrumentation. | + +## 5. Architecture + +A new module **`service_utils::observability`** owns three pieces: + +1. **`init()`** — called once from `main.rs` early in startup. Builds the OTel `MeterProvider` with two readers: a `PrometheusExporter` (renders to `/metrics`) and (if `OTEL_EXPORTER_OTLP_ENDPOINT` is set) a periodic OTLP push exporter. Returns an `Observability` handle owning the registry, a cloned `Meter`, and shutdown hooks. + +2. **`MetricsMiddleware`** — Actix `Transform`/`Service` pair wrapping every request on the main server. Records: + - `http.server.request.duration` (histogram, seconds) + - `http.server.busy.duration` (counter, seconds) + - `http.server.active_requests` (UpDownCounter) + +3. **`saturation::*`** — observable-gauge callbacks (no background tasks for r2d2 / fred) plus one `tokio::spawn` for `tokio-metrics` runtime polling. All emit OTel-namespaced metrics. + +A second component, **`metrics_server`**, is a separate `actix_web::HttpServer` on `SUPERPOSITION_METRICS_PORT` that exposes: + +- `GET /metrics` — Prometheus exposition rendered from the OTel registry + +The main server (port `8080`) gets one new `.wrap(MetricsMiddleware::new(meter.clone()))` line and three new route registrations for `/healthz`, `/livez`, `/readyz`. 
+ +### 5.1 Data flow + +```text +[request on :8080] + ├── tracing-actix-web ─→ span + ├── (auth_n / auth_z) ─→ extensions: org_id, workspace_id + ├── MetricsMiddleware ─→ start timer, inc active_requests (RAII guard) + │ └── handler runs + └── MetricsMiddleware ─→ record histogram, add busy_duration, dec active_requests, emit attributes + +[scrape on :9091/metrics] ←── PrometheusExporter ←── MeterProvider ←── (HTTP middleware + saturation collectors) + └─→ (optional) OTLP HTTP/gRPC push to OTEL_EXPORTER_OTLP_ENDPOINT + +[saturation, callback-driven] + ObservableGauge.with_callback(|obs| obs.observe(pool.state(), …)) // r2d2, fred + ObservableGauge reads from AtomicU64 written by a 10s tokio::spawn // tokio-metrics +``` + +### 5.2 Middleware ordering (critical) + +`MetricsMiddleware` must run *outside* `auth_n` / `auth_z` / `OrgWorkspaceMiddlewareFactory` so that, when emitting metrics in the response phase, it can read `org_id` / `workspace_id` from request extensions. In Actix, the last `.wrap()` runs first on requests, so the registration chain in `main.rs` should look like (matching the existing convention noted at lines 204–219 of `main.rs`): + +```rust +App::new() + .service(/* main api scopes */) + .service(health_endpoints()) // /healthz /livez /readyz + // Auth innermost so outer middlewares still run on auth failures. + .wrap(auth_z.clone()) + .wrap(auth_n.clone()) + .wrap(/* DefaultHeaders, Compress as today */) + .wrap(RequestResponseLogger) + .wrap(MetricsMiddleware::new(meter.clone(), label_cfg)) // observability — outside auth and logging, inside the tracing span + .wrap(TracingLogger::</* root span builder */>::new()) // outermost: span covers everything +``` + +**Auth bypass for health endpoints.** `auth_n` (`crates/service_utils/src/middlewares/auth_n.rs:44–60`) returns `Login::None` when the matched path is in its exception set. The existing `/health` route uses this mechanism. 
The new `/healthz`, `/livez`, `/readyz` paths are added to the same exception set construction site (the call site that builds the `HashSet` passed into `auth_n`). With the exception in place, requests to health endpoints traverse all the middlewares above (so `MetricsMiddleware` does observe them — desirable) but `auth_n` short-circuits authentication and `auth_z` follows suit. + +## 6. Module structure + +All new code under `crates/service_utils`: + +```text +crates/service_utils/src/ + observability.rs -- pub use surface: init(), Observability, shutdown(), errors + observability/ + config.rs -- ObservabilityConfig parsed from env + meters.rs -- typed handles: HttpMeters, DbMeters, RedisMeters, RuntimeMeters + middleware.rs -- MetricsMiddleware (Transform + Service + InFlightGuard) + metrics_server.rs -- HttpServer on SUPERPOSITION_METRICS_PORT exposing /metrics + health.rs -- /healthz /livez /readyz handlers + saturation.rs -- spawn entry: register_saturation_observers(...) + saturation/ + db_pool.rs -- r2d2 ObservableGauge callbacks + redis_pool.rs -- fred ObservableGauge callbacks (cfg-gated on Redis configured) + tokio_runtime.rs -- cfg(tokio_unstable); 10s poll task + AtomicU64 → ObservableGauge +``` + +Files use the modern Rust 2018+ module layout (no `mod.rs`). `crates/service_utils/src/middlewares/` is left untouched and continues to use whatever pattern it currently uses. 
+ +### 6.1 Public API sketch + +```rust +// observability.rs +pub struct Observability { /* meter_provider, registry, otlp_pipeline, shutdown_handles */ } + +impl Observability { + pub fn init(cfg: ObservabilityConfig) -> Result<Self, ObservabilityError>; + pub fn meter(&self) -> opentelemetry::metrics::Meter; + pub fn registry(&self) -> std::sync::Arc<prometheus::Registry>; + pub fn shutdown(self) -> Result<(), ObservabilityError>; +} + +pub fn metrics_middleware(meter: Meter, cfg: LabelConfig) -> middleware::MetricsMiddleware; + +pub fn spawn_metrics_server( + registry: std::sync::Arc<prometheus::Registry>, + bind: std::net::SocketAddr, +) -> std::io::Result</* server handle */>; + +pub fn health_endpoints() -> actix_web::Scope; +pub fn health_endpoint_paths() -> &'static [&'static str]; // for auth_n exception set + +pub mod saturation { + pub fn register_observers( + meter: &Meter, + deps: SaturationDeps, + ) -> Result<(), ObservabilityError>; +} + +pub struct SaturationDeps { + pub db_pool: Option</* r2d2 pool handle */>, + pub redis_client: Option</* fred client handle */>, + pub tokio_collect_interval: std::time::Duration, +} +``` + +## 7. Dependencies + +Added to root `Cargo.toml` `[workspace.dependencies]` and enabled in `crates/service_utils/Cargo.toml`. Versions pinned to whatever is current and compatible at implementation time; the table below is the intent. + +| Crate | Approx version | Purpose | +|---|---|---| +| `opentelemetry` | 0.27 | API surface: `Meter`, `Counter`, `Histogram`, `UpDownCounter`, `ObservableGauge` | +| `opentelemetry_sdk` | 0.27 | SDK: `MeterProvider`, periodic readers, resource detection | +| `opentelemetry-prometheus` | 0.27 | Bridge OTel → `prometheus::Registry` for scrape exposition | +| `opentelemetry-otlp` | 0.27 | Optional OTLP HTTP/gRPC push exporter | +| `opentelemetry-semantic-conventions` | 0.27 | String constants for attributes (`HTTP_ROUTE`, etc.) 
| +| `prometheus` | 0.13 | Required by `opentelemetry-prometheus` for `Registry` and `TextEncoder` | +| `tokio-metrics` | 0.3 | Runtime metrics; gated by `cfg(tokio_unstable)` | + +`fred` already has its `metrics` feature available; we will enable it in `service_utils/Cargo.toml` at implementation time. + +## 8. Metric definitions + +All names follow OpenTelemetry semantic conventions where they exist; saturation metrics use OTel namespaces (`db.client.*`, `runtime.*`). The Prometheus exporter translates dots to underscores and appends `_seconds` to histograms with unit `s`, etc. + +### 8.1 HTTP — golden signals + +#### Latency, traffic, errors + +One histogram covers all three. Traffic is `rate(_count)`; errors are `rate(_count{status_code=~"5.."})`. No separate counter is needed. + +| Field | Value | +|---|---| +| Name | `http.server.request.duration` | +| Type | Histogram (f64, seconds) | +| Unit | `s` | +| Buckets (explicit) | `[0.005, 0.025, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]` (9 + `+Inf` = 10 buckets) | +| Attributes | `http.request.method`, `http.response.status_code`, `http.route`, `sp.org_id`*, `sp.workspace_id`* | + +\* env-controlled, default on. Disable: `SUPERPOSITION_METRICS_LABEL_ORG=false`, `SUPERPOSITION_METRICS_LABEL_WORKSPACE=false`. + +**Bucket rationale.** Most p50/p95/p99 for a config-fetch service land in 5 ms – 500 ms; the 1 s / 2.5 s / 5 s / 10 s buckets exist to detect tail badness, not to give resolution there. Halving from OTel's 15-bucket default cuts series count nearly in half — a direct cardinality win. 
+ +**Derived expressions** (PromQL/MetricsQL): + +```promql +# Traffic — requests/sec by route +sum(rate(http_server_request_duration_seconds_count[1m])) by (http_route) + +# Error rate — 5xx fraction by route +sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~"5.."}[1m])) by (http_route) + / +sum(rate(http_server_request_duration_seconds_count[1m])) by (http_route) + +# Latency — p99 by route +histogram_quantile(0.99, + sum(rate(http_server_request_duration_seconds_bucket[1m])) by (le, http_route)) +``` + +#### Saturation — HTTP + +Two metrics, each capturing a different aspect: + +| Field | Value | +|---|---| +| Name | `http.server.busy.duration` | +| Type | Counter (f64, seconds) | +| Unit | `s` | +| Attributes | `http.request.method` | +| Semantics | On each completed request, add elapsed seconds. `rate(...)` over a window gives **time-averaged request concurrency** (Little's Law). Insensitive to scrape aliasing. | + +| Field | Value | +|---|---| +| Name | `http.server.active_requests` | +| Type | UpDownCounter | +| Attributes | `http.request.method` | +| Semantics | OTel semconv standard. Instantaneous value at scrape time. **Note:** for sub-100ms services this metric aliases badly; not the primary saturation signal. Kept for semconv compliance and dashboards that expect it. | + +`http.server.busy.duration` is the smooth, alert-safe saturation signal. `rate(http_server_busy_duration_seconds_total[1m])` is the average request concurrency over the last minute and can exceed worker count for I/O-bound work — that is expected, not a bug, because Tokio workers are not 1:1 with requests. + +### 8.2 DB pool saturation (`saturation::db_pool`) + +OTel `db.client.*` semantic conventions. Backed by `r2d2::Pool::state()` via observable callbacks — purely passive instrumentation, no changes at `pool.get()` call sites. 
+ +| Name | Type | Attributes | Source | +|---|---|---|---| +| `db.client.connections.usage` | UpDownCounter (observable) | `state="idle"\|"used"`, `pool.name` | `state.idle_connections`; `state.connections - state.idle_connections` | +| `db.client.connections.max` | Gauge (observable) | `pool.name` | `pool.max_size()` | + +`pool.name` is `"primary"` initially; the API supports multiple pools later. + +**Not in v1** (deferred to follow-up): + +- `db.client.connection.wait.duration` (histogram) — would require timing every `pool.get()` invocation across the codebase. +- `db.client.connections.pending_requests` (gauge) — would require atomic-counter instrumentation at every `pool.get()` call site. + +Both become cheap once a typed pool wrapper exists (a single `App`-level helper that wraps `r2d2::Pool` and is the only way connections are obtained); that wrapper is a separate codebase change and is not in scope here. In v1, DB-pool saturation is signalled by `connections.usage` ratios — `connections.usage{state="used"} / connections.max` near 1.0 means saturation. The request-duration histogram tail will spike under DB starvation regardless. + +### 8.3 Redis pool saturation (`saturation::redis_pool`) + +Compiled out via `cfg` if Redis is not configured. Names mirror the DB pool. Backed by `fred`'s built-in metrics surface. + +| Name | Type | Attributes | +|---|---|---| +| `redis.client.connections.usage` | UpDownCounter (observable) | `state="idle"\|"used"`, `pool.name` | +| `redis.client.commands.in_flight` | Gauge (observable) | `pool.name` | +| `redis.client.command.latency` | Histogram (s) | `pool.name`, `command_kind="read"\|"write"\|"admin"` | + +Exact mapping from `fred` stats to these metrics is finalized at implementation time; if any field is unavailable, that metric is dropped from v1 with a TODO. + +### 8.4 Tokio runtime saturation (`saturation::tokio_runtime`) + +`#[cfg(tokio_unstable)]`-gated. 
Backed by `tokio_metrics::RuntimeMonitor`, polled every `SUPERPOSITION_METRICS_COLLECT_INTERVAL` (default 10 s) into `AtomicU64`s read by observable-gauge callbacks. + +| Name | Type | Attributes | Source | +|---|---|---|---| +| `runtime.tokio.workers` | Gauge (observable) | — | `num_workers` | +| `runtime.tokio.workers.busy_ratio` | Gauge (observable, f64) | — | `total_busy_duration / total_polls / interval` | +| `runtime.tokio.global_queue.depth` | Gauge (observable) | — | `global_queue_depth` | +| `runtime.tokio.tasks.alive` | Gauge (observable) | — | `live_tasks_count` if available; otherwise dropped | + +If a contributor builds without `--cfg tokio_unstable`, the module compiles to a no-op stub; everything else still works. + +### 8.5 Resource attributes + +Set once at `MeterProvider` init, applied to every metric. + +| Attribute | Source | +|---|---| +| `service.name` | `OTEL_SERVICE_NAME` env; default `"superposition"` | +| `service.version` | `env!("CARGO_PKG_VERSION")` at build time | +| `service.instance.id` | `SUPERPOSITION_INSTANCE_ID` env; default to hostname | +| `deployment.environment` | existing env detection (`PROD`/`SANDBOX`/`DEV`) | +| `OTEL_RESOURCE_ATTRIBUTES` | merged in if set (standard OTel env var) | + +## 9. Middleware mechanics + +### 9.1 Route template extraction + +Actix exposes `req.match_pattern() -> Option<String>` returning the registered template (e.g., `/contexts/{context_id}`), not the raw URI. Three cases for `http.route`: + +| Match outcome | `http.route` value | +|---|---| +| Pattern matched | the pattern string | +| No route matched (404 from no match) | `__not_found__` | +| Static asset / Leptos frontend route | `__static__` | + +Sentinels are constants — finite set, bounded cardinality. + +`match_pattern()` is only populated after routing resolves. The middleware reads it in the response phase. The active-requests increment on entry uses `http.request.method` only, which is available immediately, so no ordering issue. 
+ +### 9.2 Label extraction + +Read from request extensions during the response phase, set upstream by `OrgWorkspaceMiddlewareFactory`: + +```rust +let org_id = req.extensions().get::</* org id extension type */>().map(|o| o.as_str().to_owned()); +let workspace = req.extensions().get::</* workspace id extension type */>().map(|w| w.as_str().to_owned()); +``` + +For each: + +| Case | Action | +|---|---| +| Present | Emit attribute with the value. | +| Absent because route does not have one (e.g., org-management routes) | Omit the attribute. Series simply lacks that label — distinct from a value of `""`. | +| Absent because middleware short-circuited before setting it (401, 403) | Omit the attribute. | +| `LabelConfig` has the label disabled | Never emit, regardless of presence. | + +### 9.3 HTTP method normalization + +Per OTel HTTP semconv: known methods (`GET`, `POST`, `PUT`, `DELETE`, `PATCH`, `HEAD`, `OPTIONS`, `TRACE`, `CONNECT`) keep their literal value; anything else collapses to `_OTHER`. Implemented as a small match — no library dependency. Prevents weird clients (`XPROPFIND`, `INVALID-㊙️`) from blowing up cardinality. + +### 9.4 Status code source + +| Outcome | Status used | +|---|---| +| Normal response | `res.status().as_u16()` | +| Handler error converted by Actix | the converted response's status | +| Panic (caught by Actix's panic handler → 500) | `500`, with `error.type="panic"` set on the histogram observation only | + +### 9.5 Active-requests guard (panic-safe) + +```rust +struct InFlightGuard { + counter: UpDownCounter<i64>, + method_attr: KeyValue, + decremented: AtomicBool, +} + +impl Drop for InFlightGuard { + fn drop(&mut self) { + if !self.decremented.swap(true, Ordering::Relaxed) { + self.counter.add(-1, &[self.method_attr.clone()]); + } + } +} +``` + +On entry: increment, build guard, store in the request future. On normal completion: explicitly decrement (sets the flag). On client disconnect / future drop / panic upstream: `Drop` decrements as a fallback. 
The histogram is recorded only on normal completion — a half-finished request's latency is not meaningful. + +### 9.6 Endpoints excluded from instrumentation + +Hard-coded in v1 (configurable later): + +- `/metrics` — physically isolated on the metrics port; cannot reach the middleware. +- Static asset routes — emit `__static__` for `http.route` instead of being skipped, so a flood is still visible. +- `/healthz` `/livez` `/readyz` — instrumented (we want to observe them); auth bypass via `auth_n`'s existing path exception set. Their own latency contributes to `http.server.request.duration` under their own routes. + +### 9.7 Per-request overhead + +Expected: + +- ~3 hashmap lookups on `req.extensions()` +- 2 system clock reads (`Instant::now()` on entry/exit) +- 1 atomic increment + 1 atomic decrement on the active-requests gauge +- 1 histogram `record()` call (lock-free in OTel SDK 0.27+) +- 1 counter `add()` call for `http.server.busy.duration` + +**Hot-path allocations:** attribute *keys* are interned via `opentelemetry::Key::from_static_str`; attribute *values* (route, org, workspace) require `String` allocations because they are dynamic. This is unavoidable given Q3's label choices and is intrinsic to OTel attribute construction. + +Total expected overhead: **single-digit microseconds per request**, well below the millisecond scale of any handler. + +## 10. Saturation collector internals + +### 10.1 Pull-on-observation pattern + +OTel's `ObservableGauge` and `ObservableCounter` invoke a callback at collection time (every scrape, every push interval). 
For sources that are cheap to read synchronously (`r2d2::Pool::state()`, `fred` stats), no background task is needed: + +```rust +let pool_clone = pool.clone(); +meter + .u64_observable_gauge("db.client.connections.usage") + .with_callback(move |observer| { + let s = pool_clone.state(); + observer.observe(s.idle_connections as u64, + &[KeyValue::new("state", "idle"), + KeyValue::new("pool.name", "primary")]); + observer.observe((s.connections - s.idle_connections) as u64, + &[KeyValue::new("state", "used"), + KeyValue::new("pool.name", "primary")]); + }) + .init(); +``` + +### 10.2 Tokio-metrics polling exception + +`tokio_metrics::RuntimeMonitor::intervals()` is a delta iterator — it returns stats since the last call, not absolute values. This requires one `tokio::spawn` polling at `SUPERPOSITION_METRICS_COLLECT_INTERVAL` (default 10 s). The task writes derived values into `AtomicU64`s; observable-gauge callbacks read those atomics. Single background task in the whole observability subsystem. + +### 10.3 Build configuration + +Workspace `.cargo/config.toml`: + +```toml +[build] +rustflags = ["--cfg", "tokio_unstable"] +``` + +Without the flag, the `saturation::tokio_runtime` module compiles to a no-op stub; everything else still works. `README.md` and `makefile` get a one-line callout. `tokio_unstable` only enables additional Tokio APIs — no behavioural change for existing code. + +CI runs `cargo check` both with and without the flag to keep the no-op stub honest. + +## 11. Configuration surface + +All env-driven; no config file. Applies to the main `superposition` binary. + +| Var | Default | Purpose | +|---|---|---| +| `SUPERPOSITION_METRICS_ENABLED` | `true` | Master switch. `false` ⇒ no init, no middleware, no listener. | +| `SUPERPOSITION_METRICS_PORT` | `9091` | Port for the `/metrics` listener. | +| `SUPERPOSITION_METRICS_BIND` | `0.0.0.0` | Bind address for the metrics listener. Set to `127.0.0.1` for loopback-only. 
| +| `SUPERPOSITION_METRICS_LABEL_ORG` | `true` | Include `sp.org_id` attribute on HTTP metrics. | +| `SUPERPOSITION_METRICS_LABEL_WORKSPACE` | `true` | Include `sp.workspace_id` attribute on HTTP metrics. | +| `SUPERPOSITION_METRICS_COLLECT_INTERVAL` | `10s` | Tokio runtime metrics poll interval (only used if `tokio_unstable`). Parsed by `humantime`. | +| `SUPERPOSITION_INSTANCE_ID` | hostname | `service.instance.id` resource attribute. | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | unset | Standard OTel env var. If set, enables OTLP push exporter. | +| `OTEL_EXPORTER_OTLP_PROTOCOL` | `http/protobuf` | Standard OTel env var. | +| `OTEL_EXPORTER_OTLP_HEADERS` | unset | Standard OTel env var. | +| `OTEL_SERVICE_NAME` | `superposition` | Standard OTel env var. | +| `OTEL_RESOURCE_ATTRIBUTES` | unset | Standard OTel env var; merged into resource. | + +Env reading uses the existing `service_utils` env-loading idiom, matching what `auth_n` etc. already do. + +### 11.1 Cardinality budget (worked) + +For the HTTP request-duration histogram, per active workspace × org pair in steady state: + +- ~30 routes × ~3 methods used × ~5 status codes seen × 12 series-per-bucket-set = **~5,400 series ceiling**, with realized usage typically 10–20 % → **~540–1,080 actual series per workspace**. + +Other metrics (active_requests, busy_duration, saturation gauges) are method-only or unlabeled → ~30 series total, independent of tenant count. + +So adding a workspace ≈ 600–1,100 new series. At 1,000 active workspaces ≈ **600 k – 1.1 M series** for the HTTP histogram. Comfortably within `vmsingle` on 16 GB. + +If workspace count grows beyond ~5,000 active, set `SUPERPOSITION_METRICS_LABEL_WORKSPACE=false` and slice by workspace via traces instead — no code change required. + +## 12. 
Testing strategy + +### 12.1 Unit tests (`crates/service_utils/src/observability/`) + +| Test module | What it asserts | +|---|---| +| `middleware::tests::label_extraction` | Table-driven: request fixtures with various extension states → expected `Vec<KeyValue>` produced. | +| `middleware::tests::method_normalization` | `XPROPFIND` → `_OTHER`; known methods pass through. | +| `middleware::tests::route_template_sentinels` | Unmatched path → `__not_found__`; static path → `__static__`. | +| `middleware::tests::active_requests_panic_safety` | Handler that panics still decrements the gauge via `Drop`. | +| `middleware::tests::label_config_disabled` | With `with_workspace_label=false`, the workspace attribute is not emitted even when present in extensions. | +| `config::tests::env_parsing` | Env-var combinations produce expected `ObservabilityConfig`. | + +### 12.2 Integration test (`crates/service_utils/tests/observability_integration.rs`) + +1. Boot a test app: `MetricsMiddleware` + a small `/test` scope with several routes + the metrics server on a random port. +2. Issue requests of varying methods, paths, status codes (including a 404 for an unmatched path). +3. Scrape the metrics port; parse the Prometheus exposition with the `prometheus-parse` crate (or equivalent). +4. Assert: + - All expected metric names exist. + - `http_server_request_duration_seconds_count` per `(route, method, status)` matches the issued count. + - `__not_found__` route appears for the 404. + - `http_server_active_requests` returns to 0 after all requests complete. + - `http_server_busy_duration_seconds_total` is approximately `Σ request_duration` (within 10 %). +5. Smoke-test the OTLP pipeline against a mock OTLP receiver if cheap; otherwise gate behind `#[ignore]` and document. + +### 12.3 Cardinality regression test + +After §12.2 scenarios run, count distinct series in the exposition. Fail the test if total exceeds a budget (initial: 200 series for the test scenario). 
Catches accidental high-cardinality labels in code review. + +## 13. Rollout + +| Phase | Duration | Action | Exit criterion | +|---|---|---|---| +| **1 — code lands disabled** | 1 PR | Land code with `SUPERPOSITION_METRICS_ENABLED=false` as the *deployed* default in prod environments (override on in CI/staging). | Process startup time unchanged; per-request overhead within noise on existing locust suite; `/metrics` exposition parses cleanly in CI. | +| **2 — staging** | 48 h | `SUPERPOSITION_METRICS_ENABLED=true` in staging. | VM ingest rate stable; series count matches §11.1 estimate to within 30 %; no scrape errors. | +| **3 — prod, no workspace label** | 1 week | Prod on, `SUPERPOSITION_METRICS_LABEL_WORKSPACE=false`. | VM headroom > 30 %; alerts (when defined in follow-up PR) firing as expected. | +| **4 — prod, full** | — | `SUPERPOSITION_METRICS_LABEL_WORKSPACE=true`. | Steady-state. | + +Existing `info!(latency, "GoldenSignal")` log line at `crates/service_utils/src/middlewares/request_response_logging.rs:84` stays for now. Marked for removal once Phase 4 is steady and dashboards have migrated. + +## 14. Future work (not implemented) + +- **Trace correlation via exemplars.** When `tracing-opentelemetry` bridges traces into the same SDK, the histogram emits exemplars linking percentile spikes to specific traces. Free win once the bridge exists. +- **Per-tenant separate histogram** (option D from Q3 brainstorm). If the global histogram's cardinality budget proves tight, add a second `http_server_request_duration_by_workspace_seconds` with fewer buckets, retaining tenant slicing without paying the cost on the global histogram. +- **OTLP traces export.** The `Observability::init` shape is structured to host trace export later. +- **Grafana dashboards.** JSON dashboards under `grafana/dashboards/` covering the four golden-signal panels. Separate PR. 
+- **Alert rules.** VM/Prometheus alert rule YAML covering: error rate > X %, p99 latency > X ms, DB pool wait p99 > X ms, Tokio busy-ratio sustained > 0.8. Separate PR. +- **Per-route overhead controls.** A route-level allowlist/denylist in `LabelConfig` so noisy or high-volume internal routes can be sampled or excluded at runtime without redeploying. +- **DB pool wait visibility.** `db.client.connection.wait.duration` (histogram) and `db.client.connections.pending_requests` (gauge), unlocked by a typed pool wrapper that is the only way to obtain a connection. One-time codebase migration, then both metrics fall out for free. +- **Removing the existing `GoldenSignal` log line.** Once dashboards are migrated, the log line in `request_response_logging.rs:84` becomes redundant. + +## 15. Risks + +| Risk | Mitigation | +|---|---| +| OTel Rust SDK 0.27 has historically had churn between minor versions; metrics API was stabilized but exporter integrations may shift. | Pin to a single minor version; central import via `service_utils::observability`; bump in a single PR with the integration test as the gate. | +| `tokio_unstable` workspace flag affects all crates and may interact with future Tokio releases. | CI matrix runs `cargo check` with and without the flag. The `saturation::tokio_runtime` module is the only consumer; everything else compiles either way. | +| Workspace label cardinality grows unexpectedly (workspace creation rate, churn from short-lived workspaces). | `SUPERPOSITION_METRICS_LABEL_WORKSPACE=false` is a runtime opt-out; rollout Phase 3 lands with it off. | +| OTel attribute construction allocates `String` on the hot path. | Confirmed unavoidable for dynamic attribute values; benchmarked overhead expected single-digit microseconds. If profiling shows a problem, switch to `Cow<'static, str>` for attribute *values* where possible (e.g., method, status code) and keep allocations only for `route`/`org`/`workspace`. 
| +| `r2d2`'s waiter count and wait duration require call-site instrumentation; v1 has only the `connections.usage` gauge (idle/used counts). | Acceptable for v1: a `used` count near `connections.max` signals saturation, and the request-duration histogram tail will spike under DB starvation. A typed pool wrapper in a follow-up unlocks both `wait.duration` and `pending_requests` cheaply. | +| The `fred` metrics surface may not map 1:1 to OTel `db.client.*` style attributes. | Mapping is finalized at implementation time; any unavailable field is dropped from v1 with a TODO and noted in the PR description. | +| Health-check probes on the main port get instrumented and add noise to `http_server_request_duration_seconds`. | Acceptable: probe cardinality is fixed (3 routes × 1 method × 1 status), and observing probe latency is desirable. |