diff --git a/.gitignore b/.gitignore
index 018d16c1f..3184dc929 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,6 +41,7 @@ test_logs
 # pre-commit config
 .pre-commit-config.yaml
 .cargo
+!.cargo/config.toml
 
 #gradle files
 .gradle
diff --git a/Cargo.lock b/Cargo.lock
index 82a5d741f..081a72582 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5851,6 +5851,7 @@ dependencies = [
  "bytes",
  "http 1.1.0",
  "opentelemetry",
+ "reqwest 0.12.8",
 ]
 
 [[package]]
@@ -5867,6 +5868,7 @@ dependencies = [
  "opentelemetry-proto",
  "opentelemetry_sdk",
  "prost",
+ "reqwest 0.12.8",
  "serde_json",
  "thiserror 1.0.58",
  "tokio",
@@ -5874,6 +5876,19 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "opentelemetry-prometheus"
+version = "0.27.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b834e966ea5e2d03dfe5f2253f03d22cce21403ee940265070eeee96cee0bcc"
+dependencies = [
+ "once_cell",
+ "opentelemetry",
+ "opentelemetry_sdk",
+ "prometheus",
+ "protobuf",
+]
+
 [[package]]
 name = "opentelemetry-proto"
 version = "0.27.0"
@@ -6412,6 +6427,21 @@ dependencies = [
  "yansi",
 ]
 
+[[package]]
+name = "prometheus"
+version = "0.13.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1"
+dependencies = [
+ "cfg-if",
+ "fnv",
+ "lazy_static",
+ "memchr",
+ "parking_lot",
+ "protobuf",
+ "thiserror 1.0.58",
+]
+
 [[package]]
 name = "prost"
 version = "0.13.5"
@@ -6435,6 +6465,12 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "protobuf"
+version = "2.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
+
 [[package]]
 name = "psm"
 version = "0.1.30"
@@ -6469,7 +6505,7 @@ dependencies = [
  "quinn-udp",
  "rustc-hash 2.1.1",
  "rustls 0.23.28",
- "socket2 0.5.10",
+ "socket2 0.6.3",
  "thiserror 2.0.18",
  "tokio",
  "tracing",
@@ -6508,7 +6544,7 @@ dependencies = [
  "cfg_aliases",
  "libc",
  "once_cell",
- "socket2 0.5.10",
+ "socket2 0.6.3",
  "tracing",
  "windows-sys 0.60.2",
 ]
@@ -7007,7 +7043,7 @@ dependencies = [
  "errno",
  "libc",
  "linux-raw-sys 0.4.15",
- "windows-sys 0.52.0",
+ "windows-sys 0.59.0",
 ]
 
 [[package]]
@@ -7664,11 +7700,17 @@ dependencies = [
  "diesel-adapter",
  "fred",
  "futures-util",
+ "humantime",
  "inventory",
  "juspay_diesel",
  "log",
  "once_cell",
  "openidconnect",
+ "opentelemetry",
+ "opentelemetry-otlp",
+ "opentelemetry-prometheus",
+ "opentelemetry_sdk",
+ "prometheus",
  "rand 0.8.5",
  "regex",
  "reqwest 0.11.27",
@@ -7680,6 +7722,7 @@ dependencies = [
  "superposition_derives",
  "superposition_macros",
  "superposition_types",
+ "thiserror 1.0.58",
  "tokio",
  "tracing",
  "tracing-actix-web",
@@ -7925,7 +7968,6 @@ dependencies = [
  "cfg-if",
  "libc",
  "psm",
- "windows-sys 0.52.0",
  "windows-sys 0.59.0",
 ]
 
@@ -8033,6 +8075,7 @@ dependencies = [
  "leptos",
  "leptos_actix",
  "log",
+ "opentelemetry",
  "regex",
  "reqwest 0.11.27",
  "rs-snowflake",
@@ -8042,6 +8085,7 @@ dependencies = [
  "superposition_derives",
  "superposition_macros",
  "superposition_types",
+ "tokio",
  "tracing",
  "tracing-actix-web",
  "tracing-subscriber",
diff --git a/Cargo.toml b/Cargo.toml
index 48423bf56..4e04bd08f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -63,6 +63,7 @@ diesel = { version = "2.2.4", package = "juspay_diesel", features = [
 ] }
 fred = { version = "9.2.1" }
 futures-util = "0.3.28"
+humantime = "2.1"
 inventory = "0.3"
 itertools = { version = "0.10.5" }
 jsonlogic = { version = "0.5.5", package = "juspay_jsonlogic" }
@@ -70,6 +71,11 @@ jsonschema = "~0.17"
 leptos = { version = "0.6.11" }
 log = { version = "0.4.20", features = ["kv_unstable_serde"] }
 once_cell = { version = "1.18.0" }
+opentelemetry = { version = "0.27", default-features = false, features = ["metrics"] }
+opentelemetry_sdk = { version = "0.27", default-features = false, features = ["metrics", "rt-tokio"] }
+opentelemetry-prometheus = { version = "0.27", default-features = false }
+opentelemetry-otlp = { version = "0.27", default-features = false, features = ["metrics", "http-proto", "reqwest-client"] }
+prometheus = { version = "0.13", default-features = false }
 regex = "1.9.1"
 reqwest = { version = "0.11.18", features = ["json"] }
 rs-snowflake = "0.6.0"
@@ -79,6 +85,7 @@ serde_json = { version = "1.0.140" }
 secrecy = "0.10"
 strum = "0.25"
 strum_macros = "0.25"
+thiserror = "1"
 tokio = { version = "1.29.1", features = ["full"] }
 toml = { version = "0.8.8", features = ["preserve_order"] }
 tracing = "0.1.44"
diff --git a/README.md b/README.md
index 01fb6de24..97a4ca457 100644
--- a/README.md
+++ b/README.md
@@ -150,6 +150,14 @@ Want a broader systems view? Open the [DeepWiki architecture guide](https://deep
 - [Context7 LLM-friendly docs](https://context7.com/juspay/superposition)
 - [DeepWiki repository guide](https://deepwiki.com/juspay/superposition)
 
+## Metrics & observability
+
+Superposition serves Prometheus metrics at `/metrics` on a separate listener; set its port with `SUPERPOSITION_METRICS_PORT` (default `9091`):
+
+```bash
+curl http://localhost:9091/metrics
+```
+
 ## Contributing
 
 We welcome contributions across the platform, clients, docs, and examples.
diff --git a/crates/experimentation_platform/src/api/experiment_groups/handlers.rs b/crates/experimentation_platform/src/api/experiment_groups/handlers.rs
index fd132adbf..6176bd38a 100644
--- a/crates/experimentation_platform/src/api/experiment_groups/handlers.rs
+++ b/crates/experimentation_platform/src/api/experiment_groups/handlers.rs
@@ -64,9 +64,9 @@ use crate::api::{
     experiments::{
         cac_api::validate_context,
         helpers::{
-            validate_change_reason_with_function, hash,
-            validate_and_add_experiment_group_id,
+            hash, validate_and_add_experiment_group_id,
             validate_and_remove_experiment_group_id,
+            validate_change_reason_with_function,
         },
     },
 };
diff --git a/crates/experimentation_platform/src/api/experiments/handlers.rs b/crates/experimentation_platform/src/api/experiments/handlers.rs
index ee62108e5..9c9aafb2b 100644
--- a/crates/experimentation_platform/src/api/experiments/handlers.rs
+++ b/crates/experimentation_platform/src/api/experiments/handlers.rs
@@ -89,9 +89,9 @@ use crate::api::{
     },
     experiments::{
         helpers::{
-            validate_change_reason_with_function,
             get_control_overrides_from_exp_id, put_experiments_in_redis,
-            validate_control_overrides, validate_delete_experiment_variants,
+            validate_change_reason_with_function, validate_control_overrides,
+            validate_delete_experiment_variants,
         },
         types::StartedByChangeSet,
     },
@@ -170,7 +170,7 @@ async fn create_handler(
         &workspace_context,
         &change_reason,
         &state,
-        &user
+        &user,
     )
     .await?;
 
@@ -462,7 +462,7 @@ async fn conclude_handler(
         &workspace_context,
         &req.change_reason,
         &state,
-        &user
+        &user,
     )
     .await?;
 
@@ -745,7 +745,7 @@ async fn discard_handler(
         &workspace_context,
         &req.change_reason,
         &state,
-        &user
+        &user,
     )
     .await?;
 
@@ -1359,7 +1359,7 @@ async fn ramp_handler(
         &workspace_context,
         &change_reason,
         &state,
-        &user
+        &user,
     )
     .await?;
 
@@ -1568,7 +1568,7 @@ async fn update_handler(
         &workspace_context,
         &change_reason,
         &state,
-        &user
+        &user,
     )
     .await?;
 
@@ -1907,7 +1907,7 @@ async fn pause_handler(
         &workspace_context,
         &req.change_reason,
         &state,
-        &user
+        &user,
     )
     .await?;
 
@@ -2004,7 +2004,7 @@ async fn resume_handler(
         &workspace_context,
         &req.change_reason,
         &state,
-        &user
+        &user,
     )
     .await?;
 
diff --git a/crates/experimentation_platform/src/api/experiments/helpers.rs b/crates/experimentation_platform/src/api/experiments/helpers.rs
index e964c2063..08e1537dc 100644
--- a/crates/experimentation_platform/src/api/experiments/helpers.rs
+++ b/crates/experimentation_platform/src/api/experiments/helpers.rs
@@ -790,7 +790,8 @@ pub async fn validate_change_reason_with_function(
         change_reason: change_reason.clone(),
     };
 
-    let headers_map = construct_header_map(workspace_context, vec![("x-user", user_str)])?;
+    let headers_map =
+        construct_header_map(workspace_context, vec![("x-user", user_str)])?;
 
     let response = http_client
         .post(&url)
diff --git a/crates/service_utils/Cargo.toml b/crates/service_utils/Cargo.toml
index ad00d2a55..6293e4065 100644
--- a/crates/service_utils/Cargo.toml
+++ b/crates/service_utils/Cargo.toml
@@ -21,10 +21,16 @@ diesel = { workspace = true }
 diesel-adapter = { version = "1.2.0" }
 fred = { workspace = true, features = ["metrics"] }
 futures-util = { workspace = true }
+humantime = { workspace = true }
 inventory = { workspace = true }
 log = { workspace = true }
 once_cell = { workspace = true }
 openidconnect = "3.5.0"
+opentelemetry = { workspace = true }
+opentelemetry_sdk = { workspace = true }
+opentelemetry-otlp = { workspace = true }
+opentelemetry-prometheus = { workspace = true }
+prometheus = { workspace = true }
 rand = "0.8"
 tokio = { workspace = true }
 tracing = { workspace = true }
@@ -42,6 +48,7 @@ superposition_types = { workspace = true, features = [
     "api",
     "diesel_derives",
 ] }
+thiserror = { workspace = true }
 url = { workspace = true }
 urlencoding = "~2.1.2"
 uuid = {workspace = true}
diff --git a/crates/service_utils/src/lib.rs b/crates/service_utils/src/lib.rs
index d8cf11600..686934474 100644
--- a/crates/service_utils/src/lib.rs
+++ b/crates/service_utils/src/lib.rs
@@ -1,10 +1,15 @@
 #![deny(unused_crate_dependencies)]
+// opentelemetry_otlp is only used in cfg(not(test)) code; suppress the lint
+// when compiling tests.
+#[cfg(test)]
+use opentelemetry_otlp as _;
 pub mod aws;
 pub mod db;
 pub mod encryption;
 pub mod extensions;
 pub mod helpers;
 pub mod middlewares;
+pub mod observability;
 pub mod redis;
 pub mod registry;
 pub mod service;
diff --git a/crates/service_utils/src/observability.rs b/crates/service_utils/src/observability.rs
new file mode 100644
index 000000000..3f2d72590
--- /dev/null
+++ b/crates/service_utils/src/observability.rs
@@ -0,0 +1,244 @@
+//! HTTP golden-signals metrics exposition via OpenTelemetry.
+//!
+//! See `docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md`.
+
+mod config;
+mod meters;
+mod metrics_server;
+mod middleware;
+mod saturation;
+
+pub use config::{LabelConfig, ObservabilityConfig};
+pub use meters::HttpMeters;
+pub use metrics_server::spawn_metrics_server;
+pub use middleware::MetricsMiddleware;
+pub use saturation::{
+    DbPoolHandle, FredPoolStats, RedisHandle, RedisStats, SaturationDeps,
+    register_observers,
+};
+
+use std::sync::Arc;
+
+use opentelemetry::metrics::Meter;
+use opentelemetry_sdk::metrics::SdkMeterProvider;
+use prometheus::Registry;
+use thiserror::Error;
+
+#[derive(Debug, Error)]
+pub enum ObservabilityError {
+    #[error("prometheus exporter init failed: {0}")]
+    PrometheusInit(String),
+    #[error("otlp exporter init failed: {0}")]
+    OtlpInit(String),
+    #[error("config error: {0}")]
+    Config(String),
+    #[error("meter provider shutdown failed: {0}")]
+    Shutdown(String),
+    #[error(transparent)]
+    Io(#[from] std::io::Error),
+}
+
+pub struct Observability {
+    provider: SdkMeterProvider,
+    registry: Arc<Registry>,
+    meter: Meter,
+}
+
+impl Observability {
+    /// Returns a clone of the `"superposition"` meter created in [`Self::init`].
+    pub fn meter(&self) -> Meter {
+        self.meter.clone()
+    }
+
+    /// Returns the Prometheus registry the exporter was registered with.
+    pub fn registry(&self) -> Arc<Registry> {
+        self.registry.clone()
+    }
+
+    /// Shuts the meter provider down; failure maps to `ObservabilityError::Shutdown`.
+    pub fn shutdown(self) -> Result<(), ObservabilityError> {
+        self.provider
+            .shutdown()
+            .map_err(|e| ObservabilityError::Shutdown(e.to_string()))
+    }
+
+    /// Builds the meter provider, installs it globally, and returns the handle.
+    ///
+    /// An OTLP reader is attached when `cfg.otlp_endpoint` is set; if that
+    /// fails, a warning is logged and only the Prometheus reader is used.
+    /// Errors with `PrometheusInit` when the Prometheus exporter cannot be built.
+    pub fn init(cfg: ObservabilityConfig) -> Result<Self, ObservabilityError> {
+        use opentelemetry::KeyValue;
+        use opentelemetry_sdk::Resource;
+        use opentelemetry_sdk::metrics::SdkMeterProvider;
+
+        let mut resource_attrs = vec![
+            KeyValue::new("service.name", cfg.service_name.clone()),
+            KeyValue::new("service.version", cfg.service_version.clone()),
+            KeyValue::new("service.instance.id", cfg.instance_id.clone()),
+        ];
+        if let Some(env) = &cfg.deployment_environment {
+            resource_attrs.push(KeyValue::new("deployment.environment", env.clone()));
+        }
+
+        // §8.5 — merge OTEL_RESOURCE_ATTRIBUTES ("k1=v1,k2=v2,...") if set.
+        // Keys and values are percent-encoded per W3C baggage / OTel spec; decode before use.
+        if let Ok(extra) = std::env::var("OTEL_RESOURCE_ATTRIBUTES") {
+            for pair in extra.split(',') {
+                if let Some((k, v)) = pair.split_once('=') {
+                    let k = urlencoding::decode(k.trim())
+                        .unwrap_or_else(|_| k.trim().into())
+                        .into_owned();
+                    let v = urlencoding::decode(v.trim())
+                        .unwrap_or_else(|_| v.trim().into())
+                        .into_owned();
+                    if !k.is_empty() {
+                        resource_attrs.push(KeyValue::new(k, v));
+                    }
+                }
+            }
+        }
+
+        let resource = Resource::new(resource_attrs);
+
+        // Attach the optional OTLP reader first. `with_otlp_reader` consumes the
+        // builder, so a failure needs a fresh one; building the Prometheus exporter
+        // afterwards means it is built against `registry` exactly once, instead of
+        // a second `build()` re-registering a collector on an already-used registry.
+        let mut builder = SdkMeterProvider::builder().with_resource(resource.clone());
+        if let Some(endpoint) = &cfg.otlp_endpoint {
+            builder = match with_otlp_reader(builder, endpoint, cfg.collect_interval) {
+                Ok(b) => b,
+                Err(e) => {
+                    tracing::warn!(
+                        error = %e,
+                        endpoint = %endpoint,
+                        "OTLP exporter init failed; metrics will be exposed via /metrics only",
+                    );
+                    SdkMeterProvider::builder().with_resource(resource)
+                }
+            };
+        }
+
+        let registry = Arc::new(prometheus::Registry::new());
+        let exporter = opentelemetry_prometheus::exporter()
+            .with_registry((*registry).clone())
+            .without_scope_info()
+            .build()
+            .map_err(|e| ObservabilityError::PrometheusInit(e.to_string()))?;
+
+        let provider = builder.with_reader(exporter).build();
+        opentelemetry::global::set_meter_provider(provider.clone());
+        let meter = {
+            use opentelemetry::metrics::MeterProvider as _;
+            provider.meter("superposition")
+        };
+
+        Ok(Self {
+            provider,
+            registry,
+            meter,
+        })
+    }
+}
+
+#[cfg(not(test))]
+fn with_otlp_reader(
+    builder: opentelemetry_sdk::metrics::MeterProviderBuilder,
+    endpoint: &str,
+    interval: std::time::Duration,
+) -> Result<opentelemetry_sdk::metrics::MeterProviderBuilder, ObservabilityError> {
+    use opentelemetry_otlp::{MetricExporter, WithExportConfig};
+    use opentelemetry_sdk::metrics::PeriodicReader;
+    use opentelemetry_sdk::runtime;
+
+    // Only the `http-proto` transport is compiled in, so any other value of
+    // OTEL_EXPORTER_OTLP_PROTOCOL (for example `grpc`) would otherwise be
+    // ignored without a trace; surface the mismatch to the operator instead.
+    let requested = std::env::var("OTEL_EXPORTER_OTLP_PROTOCOL").ok();
+    match requested.as_deref() {
+        None | Some("") | Some("http/protobuf") => {}
+        Some(protocol) => tracing::warn!(
+            requested_protocol = %protocol,
+            "OTEL_EXPORTER_OTLP_PROTOCOL set to '{}'; only 'http/protobuf' is supported in v1, using HTTP",
+            protocol
+        ),
+    }
+
+    // No header plumbing here: per the original author's note, the
+    // opentelemetry-otlp 0.27 HTTP exporter picks up `OTEL_EXPORTER_OTLP_HEADERS`
+    // and `OTEL_EXPORTER_OTLP_METRICS_HEADERS` on its own during `build()`.
+    let otlp_exporter = MetricExporter::builder()
+        .with_http()
+        .with_endpoint(endpoint.to_owned())
+        .build()
+        .map_err(|e| ObservabilityError::OtlpInit(e.to_string()))?;
+
+    let periodic = PeriodicReader::builder(otlp_exporter, runtime::Tokio)
+        .with_interval(interval)
+        .build();
+
+    Ok(builder.with_reader(periodic))
+}
+
+#[cfg(test)]
+fn with_otlp_reader(
+    builder: opentelemetry_sdk::metrics::MeterProviderBuilder,
+    _endpoint: &str,
+    _interval: std::time::Duration,
+) -> Result<opentelemetry_sdk::metrics::MeterProviderBuilder, ObservabilityError> {
+    // OTLP exporter requires a tokio runtime; we don't spin one up in unit tests.
+    Ok(builder)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn test_cfg() -> ObservabilityConfig {
+        ObservabilityConfig {
+            enabled: true,
+            bind: "127.0.0.1".parse().unwrap(),
+            port: 0,
+            label: LabelConfig::default(),
+            collect_interval: std::time::Duration::from_secs(10),
+            instance_id: "test".into(),
+            service_name: "sp-test".into(),
+            service_version: "0.0.0-test".into(),
+            deployment_environment: None,
+            otlp_endpoint: None,
+        }
+    }
+
+    #[test]
+    fn init_builds_meter_and_registry() {
+        let obs = Observability::init(test_cfg()).expect("init failed");
+        let _meter = obs.meter();
+        let registry = obs.registry();
+        let families = registry.gather();
+        assert_eq!(
+            families.len(),
+            1,
+            "only target_info should be present before any instrument records"
+        );
+        assert_eq!(families[0].get_name(), "target_info");
+    }
+
+    #[test]
+    fn meter_can_record_a_histogram_and_register_it_in_registry() {
+        let obs = Observability::init(test_cfg()).unwrap();
+        let meter = obs.meter();
+        let h = meter.f64_histogram("test.duration").with_unit("s").build();
+        h.record(0.123, &[]);
+
+        let mut buf = Vec::new();
+        let encoder = prometheus::TextEncoder::new();
+        let metric_families = obs.registry().gather();
+        prometheus::Encoder::encode(&encoder, &metric_families, &mut buf).unwrap();
+        let text = String::from_utf8(buf).unwrap();
+        assert!(
+            text.contains("test_duration"),
+            "expected test_duration in exposition, got:\n{text}"
+        );
+    }
+}
diff --git a/crates/service_utils/src/observability/config.rs b/crates/service_utils/src/observability/config.rs
new file mode 100644
index 000000000..79bccfb40
--- /dev/null
+++ b/crates/service_utils/src/observability/config.rs
@@ -0,0 +1,194 @@
+//! Configuration for the observability subsystem, parsed from env vars.
+
+use std::{net::IpAddr, str::FromStr, time::Duration};
+
+#[derive(Debug, Clone)]
+pub struct ObservabilityConfig {
+    pub enabled: bool,
+    pub bind: IpAddr,
+    pub port: u16,
+    pub label: LabelConfig,
+    pub collect_interval: Duration,
+    pub instance_id: String,
+    pub service_name: String,
+    pub service_version: String,
+    pub deployment_environment: Option<String>,
+    pub otlp_endpoint: Option<String>,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct LabelConfig {
+    pub with_org_label: bool,
+    pub with_workspace_label: bool,
+}
+
+impl Default for LabelConfig {
+    fn default() -> Self {
+        Self {
+            with_org_label: true,
+            with_workspace_label: true,
+        }
+    }
+}
+
+/// Source of env-var values: `(key) -> Some(value) | None`.
+///
+/// Marker trait with a blanket impl over every closure / fn-pointer that
+/// matches the underlying `Fn` signature. Lets `from_source` and its
+/// helpers share one named bound instead of repeating
+/// `Fn(&str) -> Option<String>` at every site. Crate-private — the
+/// public config-loading API is `from_env`.
+pub(crate) trait EnvSource: Fn(&str) -> Option<String> {}
+impl<F: Fn(&str) -> Option<String>> EnvSource for F {}
+
+impl ObservabilityConfig {
+    /// Parse from the process environment via `std::env::var`.
+    pub fn from_env() -> Result<Self, String> {
+        Self::from_source(|k| std::env::var(k).ok())
+    }
+
+    /// Generic over the env source for testability.
+    ///
+    /// `get(key)` returns `Some(value)` when the key is set, `None` when absent.
+    /// This keeps tests pure (no process-global env mutations) and parallel-safe.
+    pub(crate) fn from_source<F: EnvSource>(get: F) -> Result<Self, String> {
+        /// Parse `key` via `T: FromStr`, falling back to `default_str` when
+        /// the key is absent or set to an empty string (the same "empty means
+        /// unset" rule `get_opt` applies). Keeps lookup, default, and error
+        /// label in one place so each env key appears once at the call site.
+        fn parse_or_default<T>(
+            get: &impl EnvSource,
+            key: &'static str,
+            default_str: &str,
+        ) -> Result<T, String>
+        where
+            T: FromStr,
+            T::Err: std::fmt::Display,
+        {
+            let raw = get_opt(get, key).unwrap_or_else(|| default_str.to_owned());
+            T::from_str(&raw).map_err(|e| format!("{key}: {e}"))
+        }
+
+        fn get_opt(get: &impl EnvSource, key: &str) -> Option<String> {
+            get(key).filter(|s| !s.is_empty())
+        }
+
+        let enabled: bool =
+            parse_or_default(&get, "SUPERPOSITION_METRICS_ENABLED", "true")?;
+        let bind: IpAddr =
+            parse_or_default(&get, "SUPERPOSITION_METRICS_BIND", "0.0.0.0")?;
+        let port: u16 = parse_or_default(&get, "SUPERPOSITION_METRICS_PORT", "9091")?;
+        let with_org_label: bool =
+            parse_or_default(&get, "SUPERPOSITION_METRICS_LABEL_ORG", "true")?;
+        let with_workspace_label: bool =
+            parse_or_default(&get, "SUPERPOSITION_METRICS_LABEL_WORKSPACE", "true")?;
+        // humantime::Duration is a newtype around std::time::Duration that
+        // implements FromStr ("10s", "1m30s", "500ms"); convert back to the
+        // struct's std::time::Duration field after parsing.
+        let collect_interval: humantime::Duration =
+            parse_or_default(&get, "SUPERPOSITION_METRICS_COLLECT_INTERVAL", "10s")?;
+        let collect_interval: Duration = collect_interval.into();
+
+        // instance_id: env var takes precedence, then /etc/hostname, then "unknown".
+        let instance_id = get_opt(&get, "SUPERPOSITION_INSTANCE_ID")
+            .or_else(hostname_or_none)
+            .unwrap_or_else(|| "unknown".to_owned());
+
+        // service.name: OTEL standard env var. String: FromStr<Err = Infallible>
+        // means the `?` is a noop, but the call shape stays consistent with the
+        // parsed fields above.
+        let service_name: String =
+            parse_or_default(&get, "OTEL_SERVICE_NAME", "superposition")?;
+
+        // service.version: always the build-time crate version.
+        let service_version = env!("CARGO_PKG_VERSION").to_owned();
+
+        let deployment_environment =
+            get_opt(&get, "APP_ENV").or_else(|| get_opt(&get, "DEPLOYMENT_ENV"));
+
+        let otlp_endpoint = get_opt(&get, "OTEL_EXPORTER_OTLP_ENDPOINT");
+
+        Ok(Self {
+            enabled,
+            bind,
+            port,
+            label: LabelConfig {
+                with_org_label,
+                with_workspace_label,
+            },
+            collect_interval,
+            instance_id,
+            service_name,
+            service_version,
+            deployment_environment,
+            otlp_endpoint,
+        })
+    }
+}
+
+fn hostname_or_none() -> Option<String> {
+    // No hostname crate: read /etc/hostname; a missing file (e.g. stock macOS) yields None.
+    std::fs::read_to_string("/etc/hostname")
+        .ok()
+        .map(|s| s.trim().to_owned())
+        .filter(|s| !s.is_empty())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::collections::HashMap;
+
+    fn lookup(map: HashMap<&str, &str>) -> impl Fn(&str) -> Option<String> {
+        let owned: HashMap<String, String> = map
+            .into_iter()
+            .map(|(k, v)| (k.to_owned(), v.to_owned()))
+            .collect();
+        move |k| owned.get(k).cloned()
+    }
+
+    #[test]
+    fn defaults_when_unset() {
+        let cfg = ObservabilityConfig::from_source(|_| None).unwrap();
+        assert!(cfg.enabled);
+        assert_eq!(cfg.port, 9091);
+        assert_eq!(cfg.bind.to_string(), "0.0.0.0");
+        assert!(cfg.label.with_org_label);
+        assert!(cfg.label.with_workspace_label);
+        assert_eq!(cfg.collect_interval, Duration::from_secs(10));
+        assert_eq!(cfg.service_name, "superposition");
+        assert_eq!(cfg.otlp_endpoint, None);
+    }
+
+    #[test]
+    fn explicit_overrides() {
+        let cfg = ObservabilityConfig::from_source(lookup(HashMap::from([
+            ("SUPERPOSITION_METRICS_ENABLED", "false"),
+            ("SUPERPOSITION_METRICS_PORT", "9999"),
+            ("SUPERPOSITION_METRICS_BIND", "127.0.0.1"),
+            ("SUPERPOSITION_METRICS_LABEL_WORKSPACE", "false"),
+            ("SUPERPOSITION_METRICS_COLLECT_INTERVAL", "30s"),
+            ("OTEL_EXPORTER_OTLP_ENDPOINT", "http://collector:4318"),
+            ("OTEL_SERVICE_NAME", "sp-test"),
+        ])))
+        .unwrap();
+        assert!(!cfg.enabled);
+        assert_eq!(cfg.port, 9999);
+        assert_eq!(cfg.bind.to_string(), "127.0.0.1");
+        assert!(cfg.label.with_org_label); // default still true
+        assert!(!cfg.label.with_workspace_label);
+        assert_eq!(cfg.collect_interval, Duration::from_secs(30));
+        assert_eq!(cfg.otlp_endpoint.as_deref(), Some("http://collector:4318"));
+        assert_eq!(cfg.service_name, "sp-test");
+    }
+
+    #[test]
+    fn malformed_port_errors() {
+        let err = ObservabilityConfig::from_source(lookup(HashMap::from([(
+            "SUPERPOSITION_METRICS_PORT",
+            "not-a-number",
+        )])))
+        .unwrap_err();
+        assert!(err.contains("SUPERPOSITION_METRICS_PORT"));
+    }
+}
diff --git a/crates/service_utils/src/observability/meters.rs b/crates/service_utils/src/observability/meters.rs
new file mode 100644
index 000000000..e5a8713bf
--- /dev/null
+++ b/crates/service_utils/src/observability/meters.rs
@@ -0,0 +1,44 @@
+//! Typed handles for the metric instruments emitted by the HTTP middleware.
+
+use opentelemetry::metrics::{Counter, Histogram, Meter, UpDownCounter};
+
+/// Histogram, counter, and up-down counter for the HTTP golden signals. Built
+/// once at startup and cloned cheaply; instruments are `Arc<>`-backed internally.
+#[derive(Clone)]
+pub struct HttpMeters {
+    pub request_duration: Histogram<f64>,
+    pub busy_duration: Counter<f64>,
+    pub active_requests: UpDownCounter<i64>,
+}
+
+impl HttpMeters {
+    pub fn new(meter: &Meter) -> Self {
+        let request_duration = meter
+            .f64_histogram("http.server.request.duration")
+            .with_unit("s")
+            .with_description("Duration of HTTP server requests, in seconds.")
+            .with_boundaries(vec![0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0])
+            .build();
+
+        let busy_duration = meter
+            .f64_counter("http.server.busy.duration")
+            .with_unit("s")
+            .with_description(
+                "Cumulative seconds spent serving HTTP requests; \
+                 rate() over a window gives time-averaged request concurrency.",
+            )
+            .build();
+
+        let active_requests = meter
+            .i64_up_down_counter("http.server.active_requests")
+            .with_unit("{request}")
+            .with_description("Number of HTTP server requests currently in flight.")
+            .build();
+
+        Self {
+            request_duration,
+            busy_duration,
+            active_requests,
+        }
+    }
+}
diff --git a/crates/service_utils/src/observability/metrics_server.rs b/crates/service_utils/src/observability/metrics_server.rs
new file mode 100644
index 000000000..5ad3c3c84
--- /dev/null
+++ b/crates/service_utils/src/observability/metrics_server.rs
@@ -0,0 +1,63 @@
+//! Separate HttpServer that exposes /metrics on SUPERPOSITION_METRICS_PORT.
+
+use std::{net::SocketAddr, sync::Arc};
+
+use actix_web::{App, HttpResponse, HttpServer, dev::Server, web};
+use prometheus::{Encoder, Registry, TextEncoder};
+
+/// Bind an HttpServer to `bind` with one worker and a single `GET /metrics` route.
+/// Despite the name, no task is spawned here: the actix `Server` handle is
+/// returned so the caller can `await` it concurrently with the main app.
+pub fn spawn_metrics_server(
+    registry: Arc<Registry>,
+    bind: SocketAddr,
+) -> std::io::Result<Server> {
+    let registry_data = web::Data::new(registry);
+    Ok(HttpServer::new(move || {
+        App::new()
+            .app_data(registry_data.clone())
+            .route("/metrics", web::get().to(scrape))
+    })
+    .workers(1)
+    .bind(bind)?
+    .run())
+}
+
+/// Handler for `GET /metrics`: encodes the registry's gathered families as text.
+async fn scrape(registry: web::Data<Arc<Registry>>) -> HttpResponse {
+    let text_encoder = TextEncoder::new();
+    let mut payload = Vec::new();
+    match text_encoder.encode(&registry.gather(), &mut payload) {
+        Ok(()) => HttpResponse::Ok()
+            .content_type(text_encoder.format_type())
+            .body(payload),
+        Err(e) => HttpResponse::InternalServerError().body(format!("encode error: {e}")),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use actix_web::{App, http::StatusCode, test};
+
+    #[actix_web::test]
+    async fn scrape_endpoint_returns_text_plain() {
+        let registry = Arc::new(Registry::new());
+        let app = test::init_service(
+            App::new()
+                .app_data(web::Data::new(registry.clone()))
+                .route("/metrics", web::get().to(scrape)),
+        )
+        .await;
+        let req = test::TestRequest::get().uri("/metrics").to_request();
+        let resp = test::call_service(&app, req).await;
+        assert_eq!(resp.status(), StatusCode::OK);
+        let ct = resp
+            .headers()
+            .get("content-type")
+            .unwrap()
+            .to_str()
+            .unwrap();
+        assert!(ct.starts_with("text/plain"), "got {ct}");
+    }
+}
diff --git a/crates/service_utils/src/observability/middleware.rs b/crates/service_utils/src/observability/middleware.rs
new file mode 100644
index 000000000..11b76e44f
--- /dev/null
+++ b/crates/service_utils/src/observability/middleware.rs
@@ -0,0 +1,457 @@
+//! Actix middleware that records OpenTelemetry HTTP server metrics.
+
+use std::future::{Ready, ready};
+use std::rc::Rc;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::time::Instant;
+
+use actix_web::{
+    Error, HttpMessage,
+    body::MessageBody,
+    dev::{Service, ServiceRequest, ServiceResponse, Transform, forward_ready},
+};
+use futures_util::future::LocalBoxFuture;
+use opentelemetry::KeyValue;
+use opentelemetry::metrics::{Meter, UpDownCounter};
+
+use crate::observability::config::LabelConfig;
+use crate::observability::meters::HttpMeters;
+use crate::service::types::{OrganisationId, WorkspaceId};
+
+/// Per the OpenTelemetry HTTP semantic conventions, only the known methods
+/// listed below keep their literal name; any other method is reported as
+/// `_OTHER`, which bounds the cardinality of `http.request.method`.
+pub(crate) fn normalize_method(m: &actix_web::http::Method) -> &'static str {
+    // Methods that keep their literal name as the attribute value.
+    const KNOWN: [&str; 9] = [
+        "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "TRACE", "CONNECT",
+    ];
+    let raw = m.as_str();
+    KNOWN
+        .iter()
+        .copied()
+        .find(|known| *known == raw)
+        .unwrap_or("_OTHER")
+}
+
+/// `http.route` value recorded when no registered route matched the request.
+pub(crate) const ROUTE_NOT_FOUND: &str = "__not_found__";
+
+/// `http.route` value shared by all static-asset routes (pkg, assets,
+/// favicon), so that they collapse into a single series.
+pub(crate) const ROUTE_STATIC: &str = "__static__";
+
+/// Leading fragments of the route patterns that serve static assets.  A
+/// matched pattern beginning with any of them is reported as `ROUTE_STATIC`.
+const STATIC_PATTERN_PREFIXES: &[&str] = &["/pkg", "/assets", "/favicon"];
+
+/// Reports whether `pattern` starts with one of `STATIC_PATTERN_PREFIXES`.
+/// This is a plain string-prefix test, so a pattern such as `/pkgs` also
+/// counts as static.
+pub(crate) fn is_static_pattern(pattern: &str) -> bool {
+    let mut prefixes = STATIC_PATTERN_PREFIXES.iter();
+    prefixes.any(|prefix| pattern.starts_with(*prefix))
+}
+
+/// Maps the result of `match_pattern()` onto the `http.route` label value:
+/// `ROUTE_NOT_FOUND` when no route matched, `ROUTE_STATIC` for static-asset
+/// patterns, and the templated pattern itself otherwise.
+fn route_label(pattern: Option<String>) -> String {
+    match pattern {
+        None => ROUTE_NOT_FOUND.to_owned(),
+        Some(p) if is_static_pattern(&p) => ROUTE_STATIC.to_owned(),
+        Some(p) => p,
+    }
+}
+
+/// Extracts the `http.route` label from a `ServiceRequest`.
+///
+/// This is the request-phase variant.  In production the middleware uses
+/// [`extract_route_from_response`] (response phase) because route matching is
+/// only complete after the inner service has run.
+// Kept for callers holding a live `ServiceRequest`; may have no caller yet.
+#[allow(dead_code)]
+pub(crate) fn extract_route(req: &ServiceRequest) -> String {
+    route_label(req.match_pattern())
+}
+
+/// Same mapping as [`extract_route`], applied to the request carried by a
+/// completed `ServiceResponse` (the middleware's response phase).
+pub(crate) fn extract_route_from_response<B>(res: &ServiceResponse<B>) -> String {
+    route_label(res.request().match_pattern())
+}
+
+/// Assembles the OTel attribute list for one HTTP request.  Method, route
+/// and status code are always present; `sp.org_id` / `sp.workspace_id` are
+/// added only when the matching `LabelConfig` flag is set and a value was
+/// supplied (a missing id is omitted, not sent as an empty string).
+pub(crate) fn build_attributes(
+    method: &'static str,
+    route: &str,
+    status_code: u16,
+    org_id: Option<&str>,
+    workspace: Option<&str>,
+    label_cfg: &LabelConfig,
+) -> Vec<KeyValue> {
+    // Three mandatory attributes plus up to two optional tenant labels.
+    let mut attrs = Vec::with_capacity(5);
+    attrs.extend([
+        KeyValue::new("http.request.method", method),
+        KeyValue::new("http.route", route.to_owned()),
+        KeyValue::new("http.response.status_code", i64::from(status_code)),
+    ]);
+
+    let org = org_id.filter(|_| label_cfg.with_org_label);
+    let ws = workspace.filter(|_| label_cfg.with_workspace_label);
+    let tenant_labels = [("sp.org_id", org), ("sp.workspace_id", ws)];
+    for (key, value) in tenant_labels {
+        if let Some(id) = value {
+            attrs.push(KeyValue::new(key, id.to_owned()));
+        }
+    }
+
+    attrs
+}
+
+/// Tracks one in-flight request in the `http.server.active_requests`
+/// up/down counter: `enter` adds 1, and the matching -1 is applied exactly
+/// once, by whichever of `release()` or `Drop` runs first.
+#[must_use = "dropping InFlightGuard immediately negates the in-flight window"]
+pub(crate) struct InFlightGuard {
+    counter: UpDownCounter<i64>,
+    method: &'static str,
+    decremented: AtomicBool,
+}
+
+impl InFlightGuard {
+    /// Increments the counter for `method` and returns the guard.
+    pub(crate) fn enter(counter: UpDownCounter<i64>, method: &'static str) -> Self {
+        let decremented = AtomicBool::new(false);
+        counter.add(1, &[KeyValue::new("http.request.method", method)]);
+        Self { counter, method, decremented }
+    }
+
+    /// Decrements the counter unless a decrement was already recorded.
+    pub(crate) fn release(&self) {
+        let already_released = self.decremented.swap(true, Ordering::Relaxed);
+        if !already_released {
+            let attrs = [KeyValue::new("http.request.method", self.method)];
+            self.counter.add(-1, &attrs);
+        }
+    }
+}
+
+impl Drop for InFlightGuard {
+    fn drop(&mut self) {
+        self.release();
+    }
+}
+
+/// Actix middleware factory that records HTTP server metrics.
+#[derive(Clone)]
+pub struct MetricsMiddleware {
+    meters: HttpMeters,
+    label_cfg: LabelConfig,
+}
+
+impl MetricsMiddleware {
+    /// Builds the HTTP instruments on `meter` and stores the label settings.
+    pub fn new(meter: &Meter, label_cfg: LabelConfig) -> Self {
+        let meters = HttpMeters::new(meter);
+        Self { meters, label_cfg }
+    }
+}
+
+impl<S, B> Transform<S, ServiceRequest> for MetricsMiddleware
+where
+    S: Service<ServiceRequest, Response = ServiceResponse<B>, Error = Error> + 'static,
+    S::Future: 'static,
+    B: MessageBody + 'static,
+{
+    type Response = ServiceResponse<B>;
+    type Error = Error;
+    type InitError = ();
+    type Transform = MetricsMiddlewareImpl<S>;
+    type Future = Ready<Result<Self::Transform, Self::InitError>>;
+
+    fn new_transform(&self, service: S) -> Self::Future {
+        // The wrapper gets a clone of the instruments and the label settings.
+        let service = Rc::new(service);
+        let meters = self.meters.clone();
+        let label_cfg = self.label_cfg;
+        ready(Ok(MetricsMiddlewareImpl { service, meters, label_cfg }))
+    }
+}
+
+/// Service produced by [`MetricsMiddleware`]: wraps the inner service and
+/// records request metrics around each call.
+pub struct MetricsMiddlewareImpl<S> {
+    service: Rc<S>,
+    meters: HttpMeters,
+    label_cfg: LabelConfig,
+}
+
+impl<S, B> Service<ServiceRequest> for MetricsMiddlewareImpl<S>
+where
+    S: Service<ServiceRequest, Response = ServiceResponse<B>, Error = Error> + 'static,
+    S::Future: 'static,
+    B: MessageBody + 'static,
+{
+    type Response = ServiceResponse<B>;
+    type Error = Error;
+    type Future = LocalBoxFuture<'static, Result<Self::Response, Self::Error>>;
+
+    forward_ready!(service);
+
+    /// Runs the inner service, then records the request duration (full
+    /// attribute set) and the busy time (method attribute only).
+    fn call(&self, req: ServiceRequest) -> Self::Future {
+        let inner = Rc::clone(&self.service);
+        let meters = self.meters.clone();
+        let label_cfg = self.label_cfg;
+
+        let method = normalize_method(req.method());
+        let started = Instant::now();
+        let in_flight = InFlightGuard::enter(meters.active_requests.clone(), method);
+
+        Box::pin(async move {
+            let outcome = inner.call(req).await;
+            let elapsed = started.elapsed().as_secs_f64();
+
+            // Both arms only decide the attribute set; recording is shared.
+            let attrs = match &outcome {
+                Ok(res) => {
+                    let route = extract_route_from_response(res);
+                    // Copy the ids out so the extensions borrow ends here.
+                    let (org, ws) = {
+                        let ext = res.request().extensions();
+                        (
+                            ext.get::<OrganisationId>().map(|o| o.0.clone()),
+                            ext.get::<WorkspaceId>().map(|w| w.0.clone()),
+                        )
+                    };
+                    build_attributes(
+                        method,
+                        &route,
+                        res.status().as_u16(),
+                        org.as_deref(),
+                        ws.as_deref(),
+                        &label_cfg,
+                    )
+                }
+                Err(err) => {
+                    // `service.call` consumed the request, so no route
+                    // pattern is available and the route label is
+                    // `ROUTE_NOT_FOUND`.  The status code comes from the
+                    // error's own response instead of a blanket 500.
+                    let status = err.error_response().status().as_u16();
+                    build_attributes(
+                        method,
+                        ROUTE_NOT_FOUND,
+                        status,
+                        None,
+                        None,
+                        &label_cfg,
+                    )
+                }
+            };
+            meters.request_duration.record(elapsed, &attrs);
+            // Failed requests consumed worker time too, so busy time is
+            // counted for every outcome.
+            let by_method = [KeyValue::new("http.request.method", method)];
+            meters.busy_duration.add(elapsed, &by_method);
+
+            in_flight.release();
+            outcome
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use actix_web::http::Method;
+
+    #[test]
+    fn known_methods_pass_through() {
+        for (m, expected) in [
+            (Method::GET, "GET"),
+            (Method::POST, "POST"),
+            (Method::PUT, "PUT"),
+            (Method::DELETE, "DELETE"),
+            (Method::PATCH, "PATCH"),
+            (Method::HEAD, "HEAD"),
+            (Method::OPTIONS, "OPTIONS"),
+            (Method::TRACE, "TRACE"),
+            (Method::CONNECT, "CONNECT"),
+        ] {
+            assert_eq!(normalize_method(&m), expected);
+        }
+    }
+
+    #[test]
+    fn unknown_methods_collapse_to_other() {
+        let m = Method::from_bytes(b"XPROPFIND").unwrap();
+        assert_eq!(normalize_method(&m), "_OTHER");
+        let m = Method::from_bytes(b"WEIRDO").unwrap();
+        assert_eq!(normalize_method(&m), "_OTHER");
+    }
+
+    use actix_web::{App, HttpResponse, http::StatusCode, test as actix_test, web};
+
+    #[test]
+    fn extract_route_helper_handles_static_paths() {
+        assert!(is_static_pattern("/pkg/{tail:.*}"));
+        assert!(is_static_pattern("/assets/{tail:.*}"));
+        assert!(is_static_pattern("/favicon.ico"));
+        assert!(!is_static_pattern("/contexts/{id}"));
+        assert!(!is_static_pattern("/health"));
+    }
+
+    #[actix_web::test]
+    async fn matched_route_setup_smoke() {
+        let app = actix_test::init_service(App::new().route(
+            "/contexts/{id}",
+            web::get().to(|| async { HttpResponse::Ok().finish() }),
+        ))
+        .await;
+        let req = actix_test::TestRequest::get()
+            .uri("/contexts/abc123")
+            .to_request();
+        let resp = actix_test::call_service(&app, req).await;
+        assert_eq!(resp.status(), StatusCode::OK);
+        // Note: this test never calls `extract_route`; it only checks that a
+        // templated route set up this way answers 200.  The route label is
+        // asserted in `middleware_records_request_duration` below.
+    }
+
+    use crate::observability::config::LabelConfig;
+
+    #[test]
+    fn build_attributes_with_all_labels() {
+        let cfg = LabelConfig {
+            with_org_label: true,
+            with_workspace_label: true,
+        };
+        let attrs = build_attributes(
+            "GET",
+            "/contexts/{id}",
+            200,
+            Some("org1"),
+            Some("ws1"),
+            &cfg,
+        );
+        assert_eq!(attrs.len(), 5); // method, route, status + both tenant labels
+        assert!(attrs.iter().any(|kv| kv.key.as_str() == "sp.org_id"));
+        assert!(attrs.iter().any(|kv| kv.key.as_str() == "sp.workspace_id"));
+    }
+
+    #[test]
+    fn build_attributes_omits_missing_workspace() {
+        let cfg = LabelConfig {
+            with_org_label: true,
+            with_workspace_label: true,
+        };
+        let attrs = build_attributes("POST", "/orgs", 201, Some("org1"), None, &cfg);
+        assert_eq!(attrs.len(), 4); // workspace is `None`, so no `sp.workspace_id`
+        assert!(attrs.iter().any(|kv| kv.key.as_str() == "sp.org_id"));
+        assert!(!attrs.iter().any(|kv| kv.key.as_str() == "sp.workspace_id"));
+    }
+
+    #[test]
+    fn build_attributes_respects_disable_flag() {
+        let cfg = LabelConfig {
+            with_org_label: false,
+            with_workspace_label: false,
+        };
+        let attrs = build_attributes("GET", "/x", 200, Some("org1"), Some("ws1"), &cfg);
+        assert_eq!(attrs.len(), 3); // values supplied, but both flags are off
+        assert!(!attrs.iter().any(|kv| kv.key.as_str() == "sp.org_id"));
+        assert!(!attrs.iter().any(|kv| kv.key.as_str() == "sp.workspace_id"));
+    }
+
+    #[test]
+    fn guard_decrements_on_drop_only_once() {
+        use crate::observability::{LabelConfig, Observability, ObservabilityConfig};
+        use std::time::Duration;
+
+        let cfg = ObservabilityConfig {
+            enabled: true,
+            bind: "127.0.0.1".parse().unwrap(),
+            port: 0,
+            label: LabelConfig::default(),
+            collect_interval: Duration::from_secs(10),
+            instance_id: "test".into(),
+            service_name: "sp-test".into(),
+            service_version: "0".into(),
+            deployment_environment: None,
+            otlp_endpoint: None,
+        };
+        let obs = Observability::init(cfg).unwrap();
+        let m = obs.meter().i64_up_down_counter("test.in_flight").build();
+
+        {
+            let g = InFlightGuard::enter(m.clone(), "GET");
+            g.release();
+            // `g` is dropped at the end of this scope, after the explicit release.
+        }
+        // Repeated release() calls followed by a drop must not panic.  Note
+        // that the counter value itself is never asserted in this test.
+        let g = InFlightGuard::enter(m.clone(), "POST");
+        g.release();
+        g.release();
+        drop(g);
+    }
+
+    #[actix_web::test]
+    async fn middleware_records_request_duration() {
+        use crate::observability::{Observability, ObservabilityConfig};
+        use std::time::Duration;
+
+        let cfg = ObservabilityConfig {
+            enabled: true,
+            bind: "127.0.0.1".parse().unwrap(),
+            port: 0,
+            label: LabelConfig::default(),
+            collect_interval: Duration::from_secs(10),
+            instance_id: "test".into(),
+            service_name: "sp-test".into(),
+            service_version: "0".into(),
+            deployment_environment: None,
+            otlp_endpoint: None,
+        };
+        let obs = Observability::init(cfg).unwrap();
+        let mw = MetricsMiddleware::new(&obs.meter(), LabelConfig::default());
+
+        use actix_web::{App, HttpResponse, http::StatusCode, web};
+        let app = actix_test::init_service(App::new().wrap(mw).route(
+            "/ping",
+            web::get().to(|| async { HttpResponse::Ok().body("pong") }),
+        ))
+        .await;
+
+        let req = actix_test::TestRequest::get().uri("/ping").to_request();
+        let resp = actix_test::call_service(&app, req).await;
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        let mut buf = Vec::new();
+        let metric_families = obs.registry().gather();
+        prometheus::Encoder::encode(
+            &prometheus::TextEncoder::new(),
+            &metric_families,
+            &mut buf,
+        )
+        .unwrap();
+        let text = String::from_utf8(buf).unwrap();
+        assert!(
+            text.contains("http_server_request_duration_seconds_count"),
+            "{text}"
+        );
+        assert!(
+            text.contains("http_server_busy_duration_seconds_total"),
+            "{text}"
+        );
+        assert!(text.contains("http_server_active_requests"), "{text}");
+        assert!(text.contains("http_route=\"/ping\""), "{text}"); // route label of the hit
+    }
+}
diff --git a/crates/service_utils/src/observability/saturation.rs b/crates/service_utils/src/observability/saturation.rs
new file mode 100644
index 000000000..9255e6e31
--- /dev/null
+++ b/crates/service_utils/src/observability/saturation.rs
@@ -0,0 +1,34 @@
+//! Saturation collectors: DB pool, Redis pool, Tokio runtime.
+//!
+//! All metrics are observable-instrument callbacks — no background tasks.
+
+mod db_pool;
+mod redis_pool;
+mod tokio_runtime;
+
+use opentelemetry::metrics::Meter;
+
+pub use db_pool::DbPoolHandle;
+pub use redis_pool::{FredPoolStats, RedisHandle, RedisStats};
+
+/// Optional dependencies the saturation subsystem can observe.
+#[derive(Default, Clone)]
+pub struct SaturationDeps {
+    pub db_pool: Option<DbPoolHandle>, // `None`: no DB pool gauges are registered
+    pub redis_client: Option<redis_pool::RedisHandle>, // `None`: no Redis gauges
+}
+
+pub fn register_observers(
+    meter: &Meter,
+    deps: SaturationDeps,
+) -> Result<(), super::ObservabilityError> {
+    let SaturationDeps { db_pool: db, redis_client: redis } = deps;
+    if let Some(handle) = db {
+        db_pool::register(meter, handle, "primary");
+    }
+    if let Some(handle) = redis {
+        redis_pool::register(meter, handle, "primary");
+    }
+    tokio_runtime::register(meter); // always attempted, unlike the optional pools
+    Ok(())
+}
diff --git a/crates/service_utils/src/observability/saturation/db_pool.rs b/crates/service_utils/src/observability/saturation/db_pool.rs
new file mode 100644
index 000000000..c3201b86b
--- /dev/null
+++ b/crates/service_utils/src/observability/saturation/db_pool.rs
@@ -0,0 +1,50 @@
+//! ObservableGauge callbacks for the r2d2 connection pool. Purely passive —
+//! no instrumentation at `pool.get()` call sites.
+
+use std::sync::Arc;
+
+use opentelemetry::{KeyValue, metrics::Meter};
+
+/// Concrete pool type used across the codebase.
+///
+/// Mirrors `crate::db::PgSchemaConnectionPool` (which aliases
+/// `diesel::r2d2::Pool<diesel::r2d2::ConnectionManager<diesel::PgConnection>>`).
+/// Using an explicit expansion here so the observability subsystem does not
+/// take a hard dep on `crate::db` — callers pass the handle in via
+/// `SaturationDeps`.
+pub type DbPoolHandle =
+    Arc<diesel::r2d2::Pool<diesel::r2d2::ConnectionManager<diesel::PgConnection>>>;
+
+/// Registers two observable gauges for the pool on `meter`, both tagged with
+/// `pool.name`: `db.client.connections.usage` (one observation per state,
+/// idle and used) and `db.client.connections.max`.
+pub fn register(meter: &Meter, pool: DbPoolHandle, pool_name: &'static str) {
+    let name_attr = KeyValue::new("pool.name", pool_name);
+
+    let usage_pool = Arc::clone(&pool);
+    let usage_name = name_attr.clone();
+    meter
+        .u64_observable_gauge("db.client.connections.usage")
+        .with_description("Number of DB connections in idle/used state.")
+        .with_callback(move |observer| {
+            let state = usage_pool.state();
+            let idle = state.idle_connections;
+            // Saturating: guards against `idle` exceeding `connections`.
+            let used = state.connections.saturating_sub(idle);
+            let idle_attrs = [KeyValue::new("state", "idle"), usage_name.clone()];
+            let used_attrs = [KeyValue::new("state", "used"), usage_name.clone()];
+            observer.observe(idle as u64, &idle_attrs);
+            observer.observe(used as u64, &used_attrs);
+        })
+        .build();
+
+    // The last gauge takes ownership of `pool` and the name attribute.
+    meter
+        .u64_observable_gauge("db.client.connections.max")
+        .with_description("Configured maximum size of the DB connection pool.")
+        .with_callback(move |observer| {
+            let max = pool.max_size() as u64;
+            observer.observe(max, std::slice::from_ref(&name_attr));
+        })
+        .build();
+}
diff --git a/crates/service_utils/src/observability/saturation/redis_pool.rs b/crates/service_utils/src/observability/saturation/redis_pool.rs
new file mode 100644
index 000000000..a0576e3b6
--- /dev/null
+++ b/crates/service_utils/src/observability/saturation/redis_pool.rs
@@ -0,0 +1,91 @@
+//! Saturation gauges for the Redis client pool (fred crate).
+//!
+//! fred's `metrics` feature exposes per-client / per-pool stats. The
+//! callbacks below are intentionally tolerant: if a stat is unavailable
+//! in the version we use, the metric is simply not emitted.
+//!
+//! ## fred 9.2.1 API notes
+//!
+//! * `ClientLike::is_connected()` returns true when the client's underlying
+//!   connection to Redis is active. Counting these across the pool gives a
+//!   useful "healthy connections" gauge.
+//! * `MetricsInterface` is implemented on `RedisClient`, **not** on
+//!   `RedisPool`. The pool exposes `.clients() -> &[RedisClient]` so we can
+//!   iterate over individual clients.
+//! * `command_queue_len()` (via `MetricsInterface`) counts buffered commands
+//!   waiting to be written to the socket. Summing across pool clients gives a
+//!   useful "pending work" gauge.
+
+use std::sync::Arc;
+
+use fred::{
+    interfaces::{ClientLike, MetricsInterface},
+    prelude::RedisPool,
+};
+use opentelemetry::{KeyValue, metrics::Meter};
+
+/// Wraps whatever fred client/pool type the rest of `service_utils` uses.
+/// The wrapping type implements `RedisStats` so the observability module
+/// does not have to know fred's concrete types.
+pub type RedisHandle = Arc<dyn RedisStats + Send + Sync>;
+
+/// Thin abstraction over the fred metrics surface so the saturation module
+/// is decoupled from fred's exact API.  A getter that returns `None` simply
+/// omits the corresponding metric; this is intentional so that
+/// not-yet-wired stats do not break the build.
+pub trait RedisStats {
+    /// Number of clients in the pool with an active connection to Redis.
+    fn connected_connections(&self) -> Option<u64>;
+    /// Number of commands buffered and still waiting to be sent to Redis.
+    fn commands_in_flight(&self) -> Option<u64>;
+}
+
+/// [`RedisStats`] implementation backed by the project's fred `RedisPool`.
+pub struct FredPoolStats(pub RedisPool);
+
+impl RedisStats for FredPoolStats {
+    fn connected_connections(&self) -> Option<u64> {
+        let connected = self.0.clients().iter().filter(|c| c.is_connected()).count();
+        Some(connected as u64)
+    }
+
+    fn commands_in_flight(&self) -> Option<u64> {
+        // Per-client `command_queue_len()` (from `MetricsInterface`) counts
+        // commands buffered but not yet written to the socket; the sum over
+        // the pool's clients approximates the pending work.
+        let queued = self.0.clients().iter().map(|c| c.command_queue_len());
+        Some(queued.sum::<usize>() as u64)
+    }
+}
+
+/// Registers the Redis gauges on `meter`, each tagged with `pool.name`.
+/// A gauge reports nothing for a collection in which its getter is `None`.
+pub fn register(meter: &Meter, client: RedisHandle, pool_name: &'static str) {
+    let name_attr = KeyValue::new("pool.name", pool_name);
+
+    let connected_stats = Arc::clone(&client);
+    let connected_attr = name_attr.clone();
+    meter
+        .u64_observable_gauge("redis.client.connections.connected")
+        .with_description(
+            "Number of Redis client connections currently connected to the server.",
+        )
+        .with_callback(move |observer| {
+            if let Some(count) = connected_stats.connected_connections() {
+                observer.observe(count, std::slice::from_ref(&connected_attr));
+            }
+        })
+        .build();
+
+    meter
+        .u64_observable_gauge("redis.client.commands.in_flight")
+        .with_description(
+            "Number of Redis commands currently buffered (waiting to be sent to the server).",
+        )
+        .with_callback(move |observer| {
+            if let Some(count) = client.commands_in_flight() {
+                observer.observe(count, std::slice::from_ref(&name_attr));
+            }
+        })
+        .build();
+}
diff --git a/crates/service_utils/src/observability/saturation/tokio_runtime.rs b/crates/service_utils/src/observability/saturation/tokio_runtime.rs
new file mode 100644
index 000000000..597c3b26c
--- /dev/null
+++ b/crates/service_utils/src/observability/saturation/tokio_runtime.rs
@@ -0,0 +1,58 @@
+//! Tokio runtime saturation gauges.
+//!
+//! Reads `tokio::runtime::Handle::metrics()` directly from each observable
+//! callback — no background sampler, no atomics snapshot, no `RuntimeMonitor`.
+//! Worker count and global queue depth are stable instantaneous values; total
+//! busy time is exposed as a monotonic Counter (per-worker durations summed
+//! and reported in seconds), letting Prometheus compute the rate / saturation
+//! ratio at query time.
+//!
+//! No-op when not running on a Tokio runtime.
+
+use opentelemetry::metrics::Meter;
+
+/// Registers the tokio runtime gauges on `meter`; no-op outside a runtime.
+pub fn register(meter: &Meter) {
+    let Ok(runtime) = tokio::runtime::Handle::try_current() else {
+        return;
+    };
+
+    let workers_rt = runtime.clone();
+    meter
+        .u64_observable_gauge("runtime.tokio.workers")
+        .with_description("Number of tokio worker threads.")
+        .with_callback(move |observer| {
+            let workers = workers_rt.metrics().num_workers();
+            observer.observe(workers as u64, &[]);
+        })
+        .build();
+
+    let queue_rt = runtime.clone();
+    meter
+        .u64_observable_gauge("runtime.tokio.global_queue.depth")
+        .with_description("Tasks queued in the runtime's global injection queue.")
+        .with_callback(move |observer| {
+            let depth = queue_rt.metrics().global_queue_depth();
+            observer.observe(depth as u64, &[]);
+        })
+        .build();
+
+    // `worker_total_busy_duration` requires 64-bit atomics; gate the
+    // instrument the same way tokio gates the method.
+    #[cfg(target_has_atomic = "64")]
+    {
+        meter
+            .f64_observable_counter("runtime.tokio.workers.busy.time")
+            .with_unit("s")
+            .with_description(
+                "Cumulative time tokio worker threads have spent busy, summed across workers.",
+            )
+            .with_callback(move |observer| {
+                let metrics = runtime.metrics();
+                let busy = |i: usize| metrics.worker_total_busy_duration(i).as_secs_f64();
+                let total_secs: f64 = (0..metrics.num_workers()).map(busy).sum();
+                observer.observe(total_secs, &[]);
+            })
+            .build();
+    }
+}
diff --git a/crates/service_utils/tests/observability_integration.rs b/crates/service_utils/tests/observability_integration.rs
new file mode 100644
index 000000000..be005390f
--- /dev/null
+++ b/crates/service_utils/tests/observability_integration.rs
@@ -0,0 +1,213 @@
+//! End-to-end test: an Actix app wrapped with MetricsMiddleware serves several
+//! routes; we then issue requests and parse the Prometheus scrape output to
+//! assert on the metrics that should appear.
+
+use actix_web::{App, HttpResponse, http::StatusCode, test, web};
+use prometheus::Encoder;
+use service_utils::observability::{
+    LabelConfig, MetricsMiddleware, Observability, ObservabilityConfig, SaturationDeps,
+    register_observers,
+};
+
+fn cfg() -> ObservabilityConfig { // config handed to `Observability::init` by each test
+    ObservabilityConfig {
+        enabled: true,
+        bind: "127.0.0.1".parse().unwrap(),
+        port: 0,
+        label: LabelConfig::default(),
+        collect_interval: std::time::Duration::from_secs(10),
+        instance_id: "it".into(),
+        service_name: "sp-it".into(),
+        service_version: "0".into(),
+        deployment_environment: None,
+        otlp_endpoint: None,
+    }
+}
+
+/// Renders the registry's current metrics in Prometheus text format.
+fn scrape(obs: &Observability) -> String {
+    let families = obs.registry().gather();
+    let mut out = Vec::new();
+    let encoder = prometheus::TextEncoder::new();
+    encoder.encode(&families, &mut out).unwrap();
+    String::from_utf8(out).unwrap()
+}
+
+/// Drives several routes through the middleware and checks the scraped series.
+#[actix_web::test]
+async fn metrics_appear_after_requests() {
+    let obs = Observability::init(cfg()).unwrap();
+    let mw = MetricsMiddleware::new(&obs.meter(), LabelConfig::default());
+    let app = test::init_service(
+        App::new()
+            .wrap(mw)
+            .route(
+                "/ping",
+                web::get().to(|| async { HttpResponse::Ok().finish() }),
+            )
+            .route(
+                "/echo/{name}",
+                web::post().to(|p: web::Path<String>| async move {
+                    HttpResponse::Created().body(p.into_inner())
+                }),
+            )
+            .route(
+                "/boom",
+                web::get().to(|| async { HttpResponse::InternalServerError().finish() }),
+            ),
+    )
+    .await;
+
+    for _ in 0..3 {
+        let req = test::TestRequest::get().uri("/ping").to_request();
+        let resp = test::call_service(&app, req).await;
+        assert_eq!(resp.status(), StatusCode::OK);
+    }
+    let req = test::TestRequest::post().uri("/echo/world").to_request();
+    let resp = test::call_service(&app, req).await;
+    assert_eq!(resp.status(), StatusCode::CREATED);
+
+    let req = test::TestRequest::get().uri("/boom").to_request();
+    let resp = test::call_service(&app, req).await;
+    assert_eq!(resp.status(), StatusCode::INTERNAL_SERVER_ERROR);
+
+    let req = test::TestRequest::get().uri("/no-such-route").to_request();
+    let resp = test::call_service(&app, req).await;
+    assert_eq!(resp.status(), StatusCode::NOT_FOUND);
+
+    let body = scrape(&obs);
+    // Sample value of an exposition line: the text after its last space.
+    let sample = |line: &str| -> f64 {
+        let (_, value) = line.rsplit_once(' ').unwrap();
+        value.trim().parse().unwrap()
+    };
+
+    // Request duration histogram exists with expected labels for /ping (3 hits).
+    let ping_count_line = body
+        .lines()
+        .find(|l| {
+            l.starts_with("http_server_request_duration_seconds_count{")
+                && l.contains("http_route=\"/ping\"")
+                && l.contains("http_request_method=\"GET\"")
+                && l.contains("http_response_status_code=\"200\"")
+        })
+        .unwrap_or_else(|| panic!("no /ping count line in:\n{body}"));
+    assert_eq!(sample(ping_count_line) as u64, 3);
+
+    // 5xx series for /boom appears.
+    assert!(
+        body.lines().any(|l| {
+            l.starts_with("http_server_request_duration_seconds_count{")
+                && l.contains("http_route=\"/boom\"")
+                && l.contains("http_response_status_code=\"500\"")
+        }),
+        "no /boom 500 series in:\n{body}"
+    );
+
+    // Unmatched path uses the sentinel.
+    assert!(
+        body.lines().any(|l| {
+            l.starts_with("http_server_request_duration_seconds_count{")
+                && l.contains("http_route=\"__not_found__\"")
+        }),
+        "no __not_found__ series in:\n{body}"
+    );
+
+    // busy_duration_total > 0
+    let busy = body
+        .lines()
+        .find(|l| l.starts_with("http_server_busy_duration_seconds_total{"))
+        .unwrap_or_else(|| panic!("no busy_duration line in:\n{body}"));
+    let busy_value = sample(busy);
+    assert!(
+        busy_value > 0.0,
+        "expected busy_duration > 0, got {busy_value}"
+    );
+
+    // active_requests is back at 0; a missing metric must not pass vacuously.
+    let active_lines: Vec<_> = body
+        .lines()
+        .filter(|l| l.starts_with("http_server_active_requests{"))
+        .collect();
+    assert!(!active_lines.is_empty(), "no active_requests in:\n{body}");
+    for line in &active_lines {
+        let v = sample(line);
+        assert_eq!(v, 0.0, "active_requests not zero: {line}");
+    }
+}
+
+/// Checks that `register_observers` wires up the tokio runtime gauges: all
+/// three series must appear in the scrape.  The values themselves come from
+/// tokio; only presence and a plausible worker count are asserted.
+#[actix_web::test]
+async fn runtime_tokio_metrics_appear_after_register_observers() {
+    let obs = Observability::init(cfg()).unwrap();
+    register_observers(&obs.meter(), SaturationDeps::default()).unwrap();
+
+    let body = scrape(&obs);
+    let has_line = |prefix: &str| body.lines().any(|l| l.starts_with(prefix));
+
+    let workers_line = body
+        .lines()
+        .find(|l| l.starts_with("runtime_tokio_workers "))
+        .unwrap_or_else(|| panic!("no runtime_tokio_workers in:\n{body}"));
+    let (_, raw_workers) = workers_line.rsplit_once(' ').unwrap();
+    let workers: f64 = raw_workers.trim().parse().unwrap();
+    assert!(workers >= 1.0, "expected >=1 worker, got {workers}");
+
+    assert!(
+        has_line("runtime_tokio_global_queue_depth "),
+        "no runtime_tokio_global_queue_depth in:\n{body}"
+    );
+    assert!(
+        has_line("runtime_tokio_workers_busy_time_seconds_total "),
+        "no runtime_tokio_workers_busy_time_seconds_total in:\n{body}"
+    );
+}
+
+#[actix_web::test]
+async fn cardinality_stays_within_budget() {
+    let obs = Observability::init(cfg()).unwrap();
+    let mw = MetricsMiddleware::new(&obs.meter(), LabelConfig::default());
+    let app = test::init_service(
+        App::new()
+            .wrap(mw)
+            .route(
+                "/a",
+                web::get().to(|| async { HttpResponse::Ok().finish() }),
+            )
+            .route(
+                "/b",
+                web::get().to(|| async { HttpResponse::Ok().finish() }),
+            )
+            .route(
+                "/c",
+                web::post().to(|| async { HttpResponse::Created().finish() }),
+            ),
+    )
+    .await;
+
+    for _ in 0..10 { // repeated hits must not add series: labels do not vary per hit
+        for path in &["/a", "/b"] {
+            let req = test::TestRequest::get().uri(path).to_request();
+            let _ = test::call_service(&app, req).await;
+        }
+        let req = test::TestRequest::post().uri("/c").to_request();
+        let _ = test::call_service(&app, req).await;
+    }
+
+    let body = scrape(&obs);
+    let series = body
+        .lines()
+        .filter(|l| !l.is_empty() && !l.starts_with('#')) // skip blank and `#` lines
+        .count();
+
+    // Budget for this scenario: 3 routes × 1 method each × 1 status × ~12
+    // (10 buckets + sum + count) = ~36 series for the histogram, plus one
+    // busy_duration and one active_requests series per method (GET, POST),
+    // plus a few (e.g. `target_info`) from the exporter. Headroom: 200.
+    assert!(
+        series <= 200,
+        "cardinality regression: {series} series\n{body}"
+    );
+}
diff --git a/crates/superposition/Cargo.toml b/crates/superposition/Cargo.toml
index 26fa4bada..60f613c9c 100644
--- a/crates/superposition/Cargo.toml
+++ b/crates/superposition/Cargo.toml
@@ -42,6 +42,8 @@ tracing = { workspace = true }
 tracing-subscriber = { workspace = true }
 tracing-actix-web = { workspace = true }
 json-subscriber = { version = "0.2.7", features = ["tracing-log"] }
+tokio = { workspace = true }
+opentelemetry = { workspace = true }
 
 [lints]
 workspace = true
diff --git a/crates/superposition/src/app_state.rs b/crates/superposition/src/app_state.rs
index 60a89b515..d6d84d978 100644
--- a/crates/superposition/src/app_state.rs
+++ b/crates/superposition/src/app_state.rs
@@ -102,13 +102,12 @@ pub async fn get(
         },
         snowflake_generator,
         app_env,
-        tenant_middleware_exclusion_list: get_from_env_unsafe::<String>(
-            "TENANT_MIDDLEWARE_EXCLUSION_LIST",
-        )
-        .expect("TENANT_MIDDLEWARE_EXCLUSION_LIST is not set")
-        .split(',')
-        .map(String::from)
-        .collect::<HashSet<_>>(),
+        tenant_middleware_exclusion_list:
+            get_from_env_unsafe::<String>("TENANT_MIDDLEWARE_EXCLUSION_LIST")
+                .expect("TENANT_MIDDLEWARE_EXCLUSION_LIST is not set")
+                .split(',')
+                .map(String::from)
+                .collect::<HashSet<_>>(),
         service_prefix,
         superposition_token: get_superposition_token(kms_client, &app_env).await,
         redis: redis_pool,
diff --git a/crates/superposition/src/main.rs b/crates/superposition/src/main.rs
index 44d32f407..510ba6380 100644
--- a/crates/superposition/src/main.rs
+++ b/crates/superposition/src/main.rs
@@ -6,7 +6,7 @@ mod resolve;
 mod webhooks;
 mod workspace;
 
-use std::{io::Result, time::Duration};
+use std::{io::Result, sync::Arc, time::Duration};
 
 use actix_files::Files;
 use actix_web::{
@@ -31,6 +31,10 @@ use service_utils::{
         request_response_logging::RequestResponseLogger,
         workspace_context::OrgWorkspaceMiddlewareFactory,
     },
+    observability::{
+        FredPoolStats, MetricsMiddleware, Observability, ObservabilityConfig, RedisStats,
+        SaturationDeps, register_observers, spawn_metrics_server,
+    },
     service::types::AppEnv,
 };
 use superposition_macros::bad_argument;
@@ -77,6 +81,29 @@ async fn main() -> Result<()> {
         )
         .init();
 
+    // --- Step 1: Observability init (early, before AppState build) ---
+    // `from_env` errors are operator-config mistakes (bad port, bad IP, etc.) — fail loudly.
+    let obs_cfg =
+        ObservabilityConfig::from_env().expect("invalid observability env config");
+    // `Observability::init` may fail transiently (e.g. OTLP endpoint unreachable at startup).
+    // Rather than killing the binary we log a warning and serve traffic without metrics.
+    let observability = if obs_cfg.enabled {
+        match Observability::init(obs_cfg.clone()) {
+            Ok(o) => Some(o),
+            Err(e) => {
+                tracing::warn!(
+                    error = %e,
+                    "observability init failed; metrics disabled for this instance"
+                );
+                None
+            }
+        }
+    } else {
+        None
+    };
+    // Reflect actual init outcome: obs_enabled is true only when we have a live Observability.
+    let obs_enabled = observability.is_some();
+
     let service_prefix: String =
         get_from_env_unsafe("SERVICE_PREFIX").expect("SERVICE_PREFIX is not set");
 
@@ -142,11 +169,59 @@ async fn main() -> Result<()> {
         .await,
     );
 
+    // --- Step 2: Register saturation observers ---
+    // app_state.db_pool is PgSchemaConnectionPool (= Pool<ConnectionManager<PgConnection>>),
+    // not Arc-wrapped, so we wrap it here.
+    // app_state.redis is Option<fred::clients::RedisPool>.
+    let redis_handle: Option<Arc<dyn RedisStats + Send + Sync>> =
+        app_state.redis.as_ref().map(|pool| {
+            Arc::new(FredPoolStats(pool.clone())) as Arc<dyn RedisStats + Send + Sync>
+        });
+
+    if let Some(obs) = observability.as_ref() {
+        let deps = SaturationDeps {
+            db_pool: Some(Arc::new(app_state.db_pool.clone())),
+            redis_client: redis_handle,
+        };
+        register_observers(&obs.meter(), deps)
+            .expect("saturation observer registration failed");
+    }
+
+    // --- Step 3: Spawn the metrics server ---
+    let metrics_server_handle = if let Some(obs) = observability.as_ref() {
+        let bind = std::net::SocketAddr::new(obs_cfg.bind, obs_cfg.port);
+        match spawn_metrics_server(obs.registry(), bind) {
+            Ok(h) => Some(h),
+            Err(e) => {
+                tracing::warn!(
+                    error = %e,
+                    bind = %bind,
+                    "metrics server bind failed; /metrics endpoint disabled for this instance"
+                );
+                None
+            }
+        }
+    } else {
+        None
+    };
+
+    // --- Step 4: Capture meter + label_cfg for the closure ---
+    // When obs_enabled is true, observability is Some and meter() is valid.
+    // When obs_enabled is false, we still need a Meter instance to construct
+    // MetricsMiddleware inside the closure (Condition evaluates its argument
+    // regardless of the flag). We use the global noop meter in that case.
+    let metrics_meter = observability
+        .as_ref()
+        .map(|o| o.meter())
+        .unwrap_or_else(|| opentelemetry::global::meter("superposition-noop"));
+    let metrics_label_cfg = obs_cfg.label;
+
     let auth_n = AuthNHandler::init(&kms_client, &app_env, base.clone()).await;
     let auth_z = AuthZHandler::init(&kms_client, &app_env).await;
     let auth_z_manager = AuthZManager::init(&kms_client, &app_env).await;
 
-    HttpServer::new(move || {
+    // --- Step 5: Build the main API server (it is awaited in Step 6) ---
+    let main_server = HttpServer::new(move || {
         let leptos_options = &conf.leptos_options;
         let site_root = &leptos_options.site_root;
         let leptos_envs = ui_envs.clone();
@@ -216,6 +291,13 @@ async fn main() -> Result<()> {
             ))
             // Conditionally add request/response logging middleware for development
             .wrap(RequestResponseLogger)
+            // MetricsMiddleware gated by obs_enabled, i.e. metrics enabled in config AND Observability::init succeeded (Approach B: Condition).
+            // metrics_meter is a real Meter when enabled, or a noop Meter when disabled.
+            .wrap(Condition::new(
+                obs_enabled,
+                MetricsMiddleware::new(&metrics_meter, metrics_label_cfg),
+            ))
+            // TracingLogger is outermost — last .wrap() runs first on requests.
             .wrap(TracingLogger::<CustomRootSpanBuilder>::new())
     })
     .bind(("0.0.0.0", cac_port))?
@@ -223,8 +305,24 @@ async fn main() -> Result<()> {
     .keep_alive(Duration::from_secs(
         get_from_env_unsafe("ACTIX_KEEP_ALIVE").unwrap_or(120),
     ))
-    .run()
-    .await
+    .run();
+
+    // --- Step 6: Run the main server; metrics server is a detached best-effort task ---
+    // Using try_join! would abort the main API server if the metrics task ever returned
+    // an error (port reclaimed, listener closed). That contradicts the "metrics are
+    // best-effort" stance applied throughout. Detach instead and log on error.
+    if let Some(metrics_handle) = metrics_server_handle {
+        tokio::spawn(async move {
+            if let Err(e) = metrics_handle.await {
+                tracing::warn!(
+                    error = %e,
+                    "metrics server exited with error; /metrics endpoint is now unavailable"
+                );
+            }
+        });
+    }
+    main_server.await?;
+    Ok(())
 }
 
 trait ScopeExt {
diff --git a/crates/superposition_types/src/database/models.rs b/crates/superposition_types/src/database/models.rs
index 1af12e70e..c6544ba24 100644
--- a/crates/superposition_types/src/database/models.rs
+++ b/crates/superposition_types/src/database/models.rs
@@ -9,8 +9,8 @@ use chrono::{DateTime, Utc};
 use derive_more::{Deref, DerefMut};
 #[cfg(feature = "diesel_derives")]
 use diesel::{
-    AsChangeset, AsExpression, FromSqlRow, Insertable, QueryId, Queryable, Selectable,
     sql_types::{Json, Text},
+    AsChangeset, AsExpression, FromSqlRow, Insertable, QueryId, Queryable, Selectable,
 };
 use serde::{Deserialize, Deserializer, Serialize};
 #[cfg(all(
@@ -23,10 +23,10 @@ use superposition_derives::TextFromSqlNoValidation;
 #[cfg(feature = "diesel_derives")]
 use superposition_derives::{JsonFromSql, JsonToSql, TextToSql};
 
-#[cfg(feature = "disable_db_data_validation")]
-use super::DisableDBValidation;
 #[cfg(feature = "diesel_derives")]
 use super::superposition_schema::superposition::*;
+#[cfg(feature = "disable_db_data_validation")]
+use super::DisableDBValidation;
 
 #[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Deref, DerefMut)]
 #[serde(try_from = "String")]
diff --git a/docs/superpowers/plans/2026-05-10-otel-golden-signals-middleware.md b/docs/superpowers/plans/2026-05-10-otel-golden-signals-middleware.md
new file mode 100644
index 000000000..956ef35f6
--- /dev/null
+++ b/docs/superpowers/plans/2026-05-10-otel-golden-signals-middleware.md
@@ -0,0 +1,2386 @@
+# OpenTelemetry Golden-Signals Middleware Implementation Plan
+
+> **Status:** Plan executed; shipped with deviations. See "Post-implementation deviations" below before treating any specific task as ground truth. The plan body is preserved as historical record of original intent.
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+## Post-implementation deviations
+
+The PR shipped with the following changes versus this plan. Affected tasks are flagged inline; the canonical summary is in §0 of the design spec.
+
+- **Tasks 2, 12, 16, 17 changed substantially or obsolete.**
+  - **Task 2 (`.cargo/config.toml` for `tokio_unstable`):** not needed. Tokio 1.50 exposes the runtime metrics APIs we use as stable. The file was not created (or removed if already present).
+  - **Task 12 (`/healthz` `/livez` `/readyz` handlers) + Task 17 (auth-bypass exclusion):** dropped. The pre-existing `GET /health` covers the up-check role; the k8s liveness/readiness split is deferred to a follow-up PR.
+  - **Task 16 (Tokio runtime saturation):** rewritten. No background sampler, no `RuntimeMonitor`, no atomics snapshot, no `tokio-metrics` dep. Each observable callback reads `Handle::metrics()` directly. `runtime.tokio.workers.busy_ratio` (Gauge) is replaced with `runtime.tokio.workers.busy.time` (monotonic Counter, seconds, summed across workers); Prometheus computes saturation at query time.
+- **Task 1 / 3:** `opentelemetry-semantic-conventions` dep is **not** in the final tree — attribute names are used as literals. `tokio-metrics` was added then removed when Task 16 was rewritten.
+- **Task 18 (`main.rs` wiring):** `SaturationDeps` no longer has `tokio_collect_interval`; the `.configure(configure_health_endpoints)` line is not present.
+- **Task 21 (README + makefile):** the `tokio_unstable` build-flag note is removed; no makefile flag changes are required.
+
+**Goal:** Add an Actix middleware and supporting subsystem to `crates/service_utils` that emits Google SRE golden signals (latency, traffic, errors, saturation) for every HTTP route on the main API, exposed via Prometheus scrape on a dedicated port and optional OTLP push, using OpenTelemetry.
+
+**Architecture:** New `service_utils::observability` module owns: (a) `init()` that builds an OTel `MeterProvider` with a Prometheus exporter and an optional OTLP exporter, (b) an Actix `MetricsMiddleware` that records `http.server.request.duration` (histogram), `http.server.busy.duration` (counter), and `http.server.active_requests` (UpDownCounter) for every request, (c) saturation collectors for r2d2 DB pool, fred Redis pool, and (cfg-gated) tokio-metrics, and (d) a separate `HttpServer` on `SUPERPOSITION_METRICS_PORT` exposing `/metrics`. Health endpoints `/healthz`/`/livez`/`/readyz` mount on the main port and bypass auth via the existing `tenant_middleware_exclusion_list`.
+
+**Tech Stack:** Rust, Actix-web 4, OpenTelemetry SDK 0.27 (`opentelemetry`, `opentelemetry_sdk`, `opentelemetry-prometheus`, `opentelemetry-otlp`, `opentelemetry-semantic-conventions`), `prometheus` 0.13, `tokio-metrics` 0.3 (under `cfg(tokio_unstable)`), Diesel/r2d2, fred (Redis client).
+
+**Spec:** [`docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md`](../specs/2026-05-10-otel-golden-signals-middleware-design.md)
+
+**Notes for the implementer:**
+
+- The OpenTelemetry Rust SDK has had API churn between minor versions. The exact import paths and builder method names below are written against `opentelemetry` 0.27. If you pin a different version in Task 1, expect to adjust 1–3 import paths or method names per call site. The plan uses the **stable** APIs only (no unstable/preview features).
+- After Task 1 (deps), run `cargo check -p service_utils` after every code-touching task to catch wiring issues early — even on tasks that don't add tests yet.
+- File commit boundary: each task ends with one commit. If a step within a task fails, fix and continue within the same task before committing.
+- All paths in this plan are relative to the repo root: `<repo-root>/`.
+
+---
+
+## Task 1: Add workspace dependencies
+
+**Files:**
+- Modify: `Cargo.toml` (root)
+
+- [ ] **Step 1: Add OpenTelemetry deps to `[workspace.dependencies]`**
+
+Add the following block to the `[workspace.dependencies]` section of the root `Cargo.toml`, keeping the section in alphabetical order (each entry goes next to its alphabetical neighbours):
+
+```toml
+opentelemetry = { version = "0.27", default-features = false, features = ["metrics"] }
+opentelemetry_sdk = { version = "0.27", default-features = false, features = ["metrics", "rt-tokio"] }
+opentelemetry-prometheus = { version = "0.27", default-features = false }
+opentelemetry-otlp = { version = "0.27", default-features = false, features = ["metrics", "http-proto", "reqwest-client"] }
+opentelemetry-semantic-conventions = { version = "0.27" }
+prometheus = { version = "0.13", default-features = false }
+tokio-metrics = { version = "0.3", default-features = false, features = ["rt"] }
+humantime = "2.1"
+```
+
+- [ ] **Step 2: Verify the workspace still resolves**
+
+Run: `cargo metadata --format-version 1 > /dev/null`
+Expected: exit code 0, no errors. (This forces `cargo` to re-resolve the workspace without compiling.)
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add Cargo.toml
+git commit -m "build: add opentelemetry deps to workspace
+
+Adds opentelemetry, opentelemetry_sdk, opentelemetry-prometheus,
+opentelemetry-otlp, opentelemetry-semantic-conventions, prometheus,
+tokio-metrics, and humantime as workspace dependencies. Enabled in
+service_utils in a follow-up commit.
+"
+```
+
+---
+
+## Task 2: Add `.cargo/config.toml` for `tokio_unstable`
+
+**Files:**
+- Create: `.cargo/config.toml`
+
+- [ ] **Step 1: Create the file**
+
+```toml
+# Required by tokio-metrics for runtime instrumentation. Affects all
+# crates in the workspace; only the saturation::tokio_runtime module
+# consumes the additional APIs that this flag unlocks.
+[build]
+rustflags = ["--cfg", "tokio_unstable"]
+```
+
+- [ ] **Step 2: Verify the workspace still builds**
+
+Run: `cargo check --workspace`
+Expected: exit code 0. (Build may take a while on first run; that's fine.)
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add .cargo/config.toml
+git commit -m "build: enable tokio_unstable workspace-wide
+
+Required by tokio-metrics for runtime instrumentation introduced in
+the upcoming observability subsystem. tokio_unstable only adds APIs;
+no behavioural change for existing code.
+"
+```
+
+---
+
+## Task 3: Enable observability deps in `service_utils`
+
+**Files:**
+- Modify: `crates/service_utils/Cargo.toml`
+
+- [ ] **Step 1: Add the dependency lines**
+
+Append to the `[dependencies]` block of `crates/service_utils/Cargo.toml` (after the existing entries, preserving alphabetical-ish order):
+
+```toml
+opentelemetry = { workspace = true }
+opentelemetry_sdk = { workspace = true }
+opentelemetry-prometheus = { workspace = true }
+opentelemetry-otlp = { workspace = true }
+opentelemetry-semantic-conventions = { workspace = true }
+prometheus = { workspace = true }
+tokio-metrics = { workspace = true }
+humantime = { workspace = true }
+```
+
+(`fred` already has the `metrics` feature enabled at line 22 — no change needed there.)
+
+- [ ] **Step 2: Verify the crate still compiles**
+
+Run: `cargo check -p service_utils`
+Expected: exit code 0.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add crates/service_utils/Cargo.toml
+git commit -m "build(service_utils): enable opentelemetry deps"
+```
+
+---
+
+## Task 4: Module skeleton in `service_utils`
+
+**Files:**
+- Modify: `crates/service_utils/src/lib.rs`
+- Create: `crates/service_utils/src/observability.rs`
+
+- [ ] **Step 1: Add the `pub mod` line**
+
+Edit `crates/service_utils/src/lib.rs`, adding a new line in alphabetical position:
+
+```rust
+pub mod aws;
+pub mod db;
+pub mod encryption;
+pub mod extensions;
+pub mod helpers;
+pub mod middlewares;
+pub mod observability;   // <-- NEW LINE, between middlewares and redis
+pub mod redis;
+pub mod registry;
+pub mod service;
+```
+
+- [ ] **Step 2: Create `observability.rs` with public surface stubs**
+
+```rust
+//! HTTP golden-signals metrics exposition via OpenTelemetry.
+//!
+//! See `docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md`.
+
+mod config;
+mod health;
+mod meters;
+mod metrics_server;
+mod middleware;
+mod saturation;
+
+pub use config::{LabelConfig, ObservabilityConfig};
+pub use health::{health_endpoint_paths, health_endpoints};
+pub use metrics_server::spawn_metrics_server;
+pub use middleware::MetricsMiddleware;
+pub use saturation::{register_observers, SaturationDeps};
+
+use std::sync::Arc;
+
+use opentelemetry::metrics::Meter;
+use opentelemetry_sdk::metrics::SdkMeterProvider;
+use prometheus::Registry;
+use thiserror::Error;
+
+#[derive(Debug, Error)]
+pub enum ObservabilityError {
+    #[error("prometheus exporter init failed: {0}")]
+    PrometheusInit(String),
+    #[error("otlp exporter init failed: {0}")]
+    OtlpInit(String),
+    #[error("config error: {0}")]
+    Config(String),
+    #[error(transparent)]
+    Io(#[from] std::io::Error),
+}
+
+pub struct Observability {
+    provider: SdkMeterProvider,
+    registry: Arc<Registry>,
+    meter: Meter,
+}
+
+impl Observability {
+    pub fn meter(&self) -> Meter {
+        self.meter.clone()
+    }
+
+    pub fn registry(&self) -> Arc<Registry> {
+        self.registry.clone()
+    }
+
+    pub fn shutdown(self) -> Result<(), ObservabilityError> {
+        self.provider
+            .shutdown()
+            .map_err(|e| ObservabilityError::PrometheusInit(e.to_string()))
+    }
+
+    pub fn init(_cfg: ObservabilityConfig) -> Result<Self, ObservabilityError> {
+        // Real implementation lands in Task 7.
+        unimplemented!("Observability::init implemented in Task 7")
+    }
+}
+```
+
+(`thiserror` is already used elsewhere in the workspace; if `service_utils/Cargo.toml` does not yet depend on it, add `thiserror = "1"` to the `[dependencies]` block. Quick check: `grep thiserror crates/service_utils/Cargo.toml`.)
+
+- [ ] **Step 3: Verify it compiles**
+
+Run: `cargo check -p service_utils`
+Expected: the build fails at this point, because the child modules named in the `mod` declarations don't exist yet — proceed to Step 4, which creates them. Exit code 0 is expected only after Step 4 (verified in Step 5).
+
+- [ ] **Step 4: Create empty stub files for child modules so this task ends compilable**
+
+Create each of:
+
+- `crates/service_utils/src/observability/config.rs`:
+  ```rust
+  //! Stub — real implementation in Task 5.
+  pub struct ObservabilityConfig;
+  pub struct LabelConfig;
+  ```
+- `crates/service_utils/src/observability/meters.rs`:
+  ```rust
+  //! Stub — real implementation in Task 7.
+  ```
+- `crates/service_utils/src/observability/middleware.rs`:
+  ```rust
+  //! Stub — real implementation in Task 11.
+  pub struct MetricsMiddleware;
+  ```
+- `crates/service_utils/src/observability/metrics_server.rs`:
+  ```rust
+  //! Stub — real implementation in Task 13.
+  use std::{net::SocketAddr, sync::Arc};
+  use prometheus::Registry;
+  pub fn spawn_metrics_server(
+      _registry: Arc<Registry>,
+      _bind: SocketAddr,
+  ) -> std::io::Result<actix_web::dev::Server> {
+      unimplemented!("Task 13")
+  }
+  ```
+- `crates/service_utils/src/observability/health.rs`:
+  ```rust
+  //! Stub — real implementation in Task 12.
+  pub fn health_endpoints() -> actix_web::Scope {
+      actix_web::web::scope("")
+  }
+  pub fn health_endpoint_paths() -> &'static [&'static str] {
+      &[]
+  }
+  ```
+- `crates/service_utils/src/observability/saturation.rs`:
+  ```rust
+  //! Stub — real implementation in Task 14.
+  use opentelemetry::metrics::Meter;
+  pub struct SaturationDeps;
+  pub fn register_observers(
+      _meter: &Meter,
+      _deps: SaturationDeps,
+  ) -> Result<(), super::ObservabilityError> {
+      Ok(())
+  }
+  ```
+
+- [ ] **Step 5: Verify it compiles**
+
+Run: `cargo check -p service_utils`
+Expected: exit code 0.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add crates/service_utils/src/lib.rs crates/service_utils/src/observability.rs crates/service_utils/src/observability/
+git commit -m "feat(observability): module skeleton
+
+Adds the empty module structure that subsequent commits flesh out:
+- observability.rs: public surface, Observability handle, errors
+- observability/{config,meters,middleware,metrics_server,health,saturation}.rs: stubs
+
+No behaviour change. The Observability::init() body is unimplemented!()
+until Task 7.
+"
+```
+
+---
+
+## Task 5: `ObservabilityConfig` from env
+
+**Files:**
+- Modify: `crates/service_utils/src/observability/config.rs`
+
+- [ ] **Step 1: Write the failing test**
+
+Replace the contents of `crates/service_utils/src/observability/config.rs` with:
+
+```rust
+//! Configuration for the observability subsystem, parsed from env vars.
+
+use std::{net::IpAddr, str::FromStr, time::Duration};
+
+#[derive(Debug, Clone)]
+pub struct ObservabilityConfig {
+    pub enabled: bool,
+    pub bind: IpAddr,
+    pub port: u16,
+    pub label: LabelConfig,
+    pub collect_interval: Duration,
+    pub instance_id: String,
+    pub service_name: String,
+    pub service_version: String,
+    pub deployment_environment: Option<String>,
+    pub otlp_endpoint: Option<String>,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct LabelConfig {
+    pub with_org_label: bool,
+    pub with_workspace_label: bool,
+}
+
+impl Default for LabelConfig {
+    fn default() -> Self {
+        Self { with_org_label: true, with_workspace_label: true }
+    }
+}
+
+impl ObservabilityConfig {
+    pub fn from_env() -> Result<Self, String> {
+        fn env_bool(key: &str, default: bool) -> Result<bool, String> {
+            match std::env::var(key) {
+                Ok(v) => v.parse::<bool>().map_err(|_| format!("{key} must be true or false")),
+                Err(_) => Ok(default),
+            }
+        }
+        fn env_str(key: &str, default: &str) -> String {
+            std::env::var(key).unwrap_or_else(|_| default.to_owned())
+        }
+        fn env_opt(key: &str) -> Option<String> {
+            std::env::var(key).ok().filter(|s| !s.is_empty())
+        }
+
+        let enabled = env_bool("SUPERPOSITION_METRICS_ENABLED", true)?;
+        let bind = IpAddr::from_str(&env_str("SUPERPOSITION_METRICS_BIND", "0.0.0.0"))
+            .map_err(|e| format!("SUPERPOSITION_METRICS_BIND: {e}"))?;
+        let port: u16 = env_str("SUPERPOSITION_METRICS_PORT", "9091")
+            .parse()
+            .map_err(|e| format!("SUPERPOSITION_METRICS_PORT: {e}"))?;
+        let with_org_label = env_bool("SUPERPOSITION_METRICS_LABEL_ORG", true)?;
+        let with_workspace_label = env_bool("SUPERPOSITION_METRICS_LABEL_WORKSPACE", true)?;
+        let collect_interval =
+            humantime::parse_duration(&env_str("SUPERPOSITION_METRICS_COLLECT_INTERVAL", "10s"))
+                .map_err(|e| format!("SUPERPOSITION_METRICS_COLLECT_INTERVAL: {e}"))?;
+        let instance_id = env_opt("SUPERPOSITION_INSTANCE_ID")
+            .or_else(|| hostname_or_none())
+            .unwrap_or_else(|| "unknown".to_owned());
+        let service_name = env_str("OTEL_SERVICE_NAME", "superposition");
+        let service_version = env!("CARGO_PKG_VERSION").to_owned();
+        let deployment_environment = env_opt("APP_ENV").or_else(|| env_opt("DEPLOYMENT_ENV"));
+        let otlp_endpoint = env_opt("OTEL_EXPORTER_OTLP_ENDPOINT");
+
+        Ok(Self {
+            enabled,
+            bind,
+            port,
+            label: LabelConfig { with_org_label, with_workspace_label },
+            collect_interval,
+            instance_id,
+            service_name,
+            service_version,
+            deployment_environment,
+            otlp_endpoint,
+        })
+    }
+}
+
+fn hostname_or_none() -> Option<String> {
+    // Avoid pulling in a hostname crate; read /etc/hostname instead. Where that file is absent (e.g. macOS) this returns None.
+    std::fs::read_to_string("/etc/hostname")
+        .ok()
+        .map(|s| s.trim().to_owned())
+        .filter(|s| !s.is_empty())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Serialises the env-mutating tests without adding the `serial_test` crate:
+    /// a shared lock plus an env snapshot/restore helper keeps them from racing.
+    fn with_env<F: FnOnce()>(vars: &[(&str, Option<&str>)], f: F) {
+        use std::sync::Mutex;
+        static LOCK: Mutex<()> = Mutex::new(());
+        let _guard = LOCK.lock().unwrap();
+        let prev: Vec<_> =
+            vars.iter().map(|(k, _)| (k.to_string(), std::env::var(k).ok())).collect();
+        for (k, v) in vars {
+            match v {
+                Some(v) => std::env::set_var(k, v),
+                None => std::env::remove_var(k),
+            }
+        }
+        f();
+        for (k, v) in prev {
+            match v {
+                Some(v) => std::env::set_var(&k, &v),
+                None => std::env::remove_var(&k),
+            }
+        }
+    }
+
+    #[test]
+    fn defaults_when_unset() {
+        with_env(
+            &[
+                ("SUPERPOSITION_METRICS_ENABLED", None),
+                ("SUPERPOSITION_METRICS_PORT", None),
+                ("SUPERPOSITION_METRICS_BIND", None),
+                ("SUPERPOSITION_METRICS_LABEL_ORG", None),
+                ("SUPERPOSITION_METRICS_LABEL_WORKSPACE", None),
+                ("SUPERPOSITION_METRICS_COLLECT_INTERVAL", None),
+                ("OTEL_EXPORTER_OTLP_ENDPOINT", None),
+                ("OTEL_SERVICE_NAME", None),
+            ],
+            || {
+                let cfg = ObservabilityConfig::from_env().unwrap();
+                assert!(cfg.enabled);
+                assert_eq!(cfg.port, 9091);
+                assert_eq!(cfg.bind.to_string(), "0.0.0.0");
+                assert!(cfg.label.with_org_label);
+                assert!(cfg.label.with_workspace_label);
+                assert_eq!(cfg.collect_interval, Duration::from_secs(10));
+                assert_eq!(cfg.service_name, "superposition");
+                assert_eq!(cfg.otlp_endpoint, None);
+            },
+        );
+    }
+
+    #[test]
+    fn explicit_overrides() {
+        with_env(
+            &[
+                ("SUPERPOSITION_METRICS_ENABLED", Some("false")),
+                ("SUPERPOSITION_METRICS_PORT", Some("9999")),
+                ("SUPERPOSITION_METRICS_BIND", Some("127.0.0.1")),
+                ("SUPERPOSITION_METRICS_LABEL_WORKSPACE", Some("false")),
+                ("SUPERPOSITION_METRICS_COLLECT_INTERVAL", Some("30s")),
+                ("OTEL_EXPORTER_OTLP_ENDPOINT", Some("http://collector:4318")),
+                ("OTEL_SERVICE_NAME", Some("sp-test")),
+            ],
+            || {
+                let cfg = ObservabilityConfig::from_env().unwrap();
+                assert!(!cfg.enabled);
+                assert_eq!(cfg.port, 9999);
+                assert_eq!(cfg.bind.to_string(), "127.0.0.1");
+                assert!(cfg.label.with_org_label); // default still true
+                assert!(!cfg.label.with_workspace_label);
+                assert_eq!(cfg.collect_interval, Duration::from_secs(30));
+                assert_eq!(cfg.otlp_endpoint.as_deref(), Some("http://collector:4318"));
+                assert_eq!(cfg.service_name, "sp-test");
+            },
+        );
+    }
+
+    #[test]
+    fn malformed_port_errors() {
+        with_env(
+            &[("SUPERPOSITION_METRICS_PORT", Some("not-a-number"))],
+            || {
+                let err = ObservabilityConfig::from_env().unwrap_err();
+                assert!(err.contains("SUPERPOSITION_METRICS_PORT"));
+            },
+        );
+    }
+}
+```
+
+- [ ] **Step 2: Run the tests**
+
+Run: `cargo test -p service_utils observability::config -- --test-threads=1`
+Expected: 3 tests pass.
+
+(`--test-threads=1` is required because the tests mutate process env vars; the in-test mutex covers same-binary races but doctests/other tests in parallel could interleave.)
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add crates/service_utils/src/observability/config.rs
+git commit -m "feat(observability): config from env
+
+Reads SUPERPOSITION_METRICS_* and OTEL_* env vars into a typed
+ObservabilityConfig. Defaults: enabled, port 9091, bind 0.0.0.0,
+both org/workspace labels on, 10s collect interval.
+"
+```
+
+---
+
+## Task 6: HTTP method normalization helper
+
+**Files:**
+- Modify: `crates/service_utils/src/observability/middleware.rs`
+
+- [ ] **Step 1: Replace stub with TDD scaffold**
+
+```rust
+//! Actix middleware that records OpenTelemetry HTTP server metrics.
+
+/// Per OpenTelemetry HTTP semantic conventions, only known methods get their
+/// literal name; anything else collapses to `_OTHER`. Prevents weirdo clients
+/// from blowing up the cardinality of the `http.request.method` attribute.
+pub(crate) fn normalize_method(m: &actix_web::http::Method) -> &'static str {
+    match m.as_str() {
+        "GET" => "GET",
+        "POST" => "POST",
+        "PUT" => "PUT",
+        "DELETE" => "DELETE",
+        "PATCH" => "PATCH",
+        "HEAD" => "HEAD",
+        "OPTIONS" => "OPTIONS",
+        "TRACE" => "TRACE",
+        "CONNECT" => "CONNECT",
+        _ => "_OTHER",
+    }
+}
+
+pub struct MetricsMiddleware;   // placeholder until Task 11
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use actix_web::http::Method;
+
+    #[test]
+    fn known_methods_pass_through() {
+        for (m, expected) in [
+            (Method::GET, "GET"),
+            (Method::POST, "POST"),
+            (Method::PUT, "PUT"),
+            (Method::DELETE, "DELETE"),
+            (Method::PATCH, "PATCH"),
+            (Method::HEAD, "HEAD"),
+            (Method::OPTIONS, "OPTIONS"),
+            (Method::TRACE, "TRACE"),
+            (Method::CONNECT, "CONNECT"),
+        ] {
+            assert_eq!(normalize_method(&m), expected);
+        }
+    }
+
+    #[test]
+    fn unknown_methods_collapse_to_other() {
+        let m = Method::from_bytes(b"XPROPFIND").unwrap();
+        assert_eq!(normalize_method(&m), "_OTHER");
+        let m = Method::from_bytes(b"WEIRDO").unwrap();
+        assert_eq!(normalize_method(&m), "_OTHER");
+    }
+}
+```
+
+- [ ] **Step 2: Run the tests**
+
+Run: `cargo test -p service_utils observability::middleware`
+Expected: 2 tests pass.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add crates/service_utils/src/observability/middleware.rs
+git commit -m "feat(observability): http method normalization
+
+Per OTel semconv: collapse unknown methods to _OTHER to bound the
+cardinality of the http.request.method label.
+"
+```
+
+---
+
+## Task 7: `Observability::init()` with Prometheus exporter
+
+**Files:**
+- Modify: `crates/service_utils/src/observability.rs`
+- Modify: `crates/service_utils/src/observability/meters.rs`
+
+- [ ] **Step 1: Define `HttpMeters` struct**
+
+Replace `crates/service_utils/src/observability/meters.rs`:
+
+```rust
+//! Typed handles for the metric instruments emitted by the HTTP middleware.
+
+use opentelemetry::metrics::{Counter, Histogram, Meter, UpDownCounter};
+
+/// Histogram + counter + gauge for HTTP server golden signals. Built once at
+/// startup and cloned cheaply; instruments are `Arc<>`-backed internally.
+#[derive(Clone)]
+pub struct HttpMeters {
+    /// `http.server.request.duration` histogram, in seconds.
+    pub request_duration: Histogram<f64>,
+    /// `http.server.busy.duration` counter: cumulative seconds spent serving.
+    pub busy_duration: Counter<f64>,
+    /// `http.server.active_requests` up/down counter of in-flight requests.
+    pub active_requests: UpDownCounter<i64>,
+}
+
+impl HttpMeters {
+    /// Creates the three HTTP instruments on `meter`.
+    ///
+    /// The histogram uses explicit bucket boundaries from 5 ms to 10 s; the
+    /// other two instruments use the meter's defaults.
+    pub fn new(meter: &Meter) -> Self {
+        let request_duration = meter
+            .f64_histogram("http.server.request.duration")
+            .with_unit("s")
+            .with_description("Duration of HTTP server requests, in seconds.")
+            .with_boundaries(vec![
+                0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
+            ])
+            .build();
+
+        let busy_duration = meter
+            .f64_counter("http.server.busy.duration")
+            .with_unit("s")
+            .with_description(
+                "Cumulative seconds spent serving HTTP requests; \
+                 rate() over a window gives time-averaged request concurrency.",
+            )
+            .build();
+
+        // No unit is set on this instrument, unlike the two above.
+        let active_requests = meter
+            .i64_up_down_counter("http.server.active_requests")
+            .with_description("Number of HTTP server requests currently in flight.")
+            .build();
+
+        Self { request_duration, busy_duration, active_requests }
+    }
+}
+```
+
+- [ ] **Step 2: Implement `Observability::init`**
+
+Replace the body of `Observability::init` in `crates/service_utils/src/observability.rs`:
+
+```rust
+impl Observability {
+    /// Builds the metrics pipeline described by `cfg`.
+    ///
+    /// Creates a fresh Prometheus registry, attaches an
+    /// `opentelemetry_prometheus` exporter to it, and builds an
+    /// `SdkMeterProvider` carrying the `service.*` resource attributes (plus
+    /// `deployment.environment` when configured). When `cfg.otlp_endpoint` is
+    /// set, an additional reader is attached via `with_otlp_reader`. The
+    /// provider is also installed as the process-global meter provider.
+    ///
+    /// # Errors
+    ///
+    /// Returns `ObservabilityError::PrometheusInit` if the exporter cannot be
+    /// built, or whatever error `with_otlp_reader` reports.
+    pub fn init(cfg: ObservabilityConfig) -> Result<Self, ObservabilityError> {
+        use opentelemetry::KeyValue;
+        use opentelemetry_sdk::Resource;
+        use opentelemetry_sdk::metrics::SdkMeterProvider;
+
+        let registry = Arc::new(prometheus::Registry::new());
+
+        // The exporter receives a clone of the registry value; the `Arc` handle
+        // is what gets stored on `Self` below.
+        let exporter = opentelemetry_prometheus::exporter()
+            .with_registry((*registry).clone())
+            .build()
+            .map_err(|e| ObservabilityError::PrometheusInit(e.to_string()))?;
+
+        let mut resource_attrs = vec![
+            KeyValue::new("service.name", cfg.service_name.clone()),
+            KeyValue::new("service.version", cfg.service_version.clone()),
+            KeyValue::new("service.instance.id", cfg.instance_id.clone()),
+        ];
+        // Omitted entirely (rather than set to an empty value) when unset.
+        if let Some(env) = &cfg.deployment_environment {
+            resource_attrs.push(KeyValue::new("deployment.environment", env.clone()));
+        }
+
+        let mut builder = SdkMeterProvider::builder()
+            .with_reader(exporter)
+            .with_resource(Resource::new(resource_attrs));
+
+        if let Some(endpoint) = &cfg.otlp_endpoint {
+            builder = with_otlp_reader(builder, endpoint, cfg.collect_interval)?;
+        }
+
+        let provider = builder.build();
+        opentelemetry::global::set_meter_provider(provider.clone());
+        // NOTE(review): `meter(..)` looks like a method of the
+        // `opentelemetry::metrics::MeterProvider` trait — confirm that trait is
+        // imported at the top of this file.
+        let meter = provider.meter("superposition");
+
+        Ok(Self { provider, registry, meter })
+    }
+}
+
+/// Attaches a periodic OTLP-over-HTTP metrics reader to `builder`.
+///
+/// The exporter pushes to `endpoint`, and the reader is configured with
+/// `interval`. Compiled only outside of test builds.
+///
+/// # Errors
+///
+/// Returns `ObservabilityError::OtlpInit` if the OTLP exporter cannot be built.
+#[cfg(not(test))]
+fn with_otlp_reader(
+    builder: opentelemetry_sdk::metrics::MeterProviderBuilder,
+    endpoint: &str,
+    interval: std::time::Duration,
+) -> Result<opentelemetry_sdk::metrics::MeterProviderBuilder, ObservabilityError> {
+    use opentelemetry_otlp::{MetricExporter, WithExportConfig};
+    use opentelemetry_sdk::metrics::PeriodicReader;
+    use opentelemetry_sdk::runtime;
+
+    let exporter = MetricExporter::builder()
+        .with_http()
+        .with_endpoint(endpoint.to_owned())
+        .build()
+        .map_err(|e| ObservabilityError::OtlpInit(e.to_string()))?;
+
+    // NOTE(review): `runtime::Tokio` presumably needs an active Tokio runtime
+    // when this is called; confirm it suits the runtime flavour the service
+    // starts under.
+    let reader = PeriodicReader::builder(exporter, runtime::Tokio)
+        .with_interval(interval)
+        .build();
+
+    Ok(builder.with_reader(reader))
+}
+
+/// Test-build stand-in for the OTLP reader wiring: returns `builder`
+/// unchanged and ignores both the endpoint and the interval.
+#[cfg(test)]
+fn with_otlp_reader(
+    builder: opentelemetry_sdk::metrics::MeterProviderBuilder,
+    _endpoint: &str,
+    _interval: std::time::Duration,
+) -> Result<opentelemetry_sdk::metrics::MeterProviderBuilder, ObservabilityError> {
+    // OTLP exporter requires a tokio runtime; we don't spin one up in unit tests.
+    Ok(builder)
+}
+```
+
+- [ ] **Step 3: Add a smoke test**
+
+Append to `crates/service_utils/src/observability.rs`:
+
+```rust
+// Smoke tests for `Observability::init` and the Prometheus registry wiring.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Minimal configuration: no OTLP endpoint and no deployment environment.
+    fn test_cfg() -> ObservabilityConfig {
+        ObservabilityConfig {
+            enabled: true,
+            bind: "127.0.0.1".parse().unwrap(),
+            port: 0,
+            label: LabelConfig::default(),
+            collect_interval: std::time::Duration::from_secs(10),
+            instance_id: "test".into(),
+            service_name: "sp-test".into(),
+            service_version: "0.0.0-test".into(),
+            deployment_environment: None,
+            otlp_endpoint: None,
+        }
+    }
+
+    /// A freshly initialised pipeline exposes no instrument families.
+    #[test]
+    fn init_builds_meter_and_registry() {
+        let obs = Observability::init(test_cfg()).expect("init failed");
+        let _meter = obs.meter();
+        let registry = obs.registry();
+        // The Prometheus exporter can publish a `target_info` family describing
+        // the resource attributes before any instrument has recorded a value,
+        // so that family is excluded from the "nothing emitted yet" check.
+        let instrument_families = registry
+            .gather()
+            .into_iter()
+            .filter(|family| family.get_name() != "target_info")
+            .count();
+        assert_eq!(instrument_families, 0, "no metrics emitted yet");
+    }
+
+    /// A histogram recorded through the meter shows up in the text exposition.
+    #[test]
+    fn meter_can_record_a_histogram_and_register_it_in_registry() {
+        let obs = Observability::init(test_cfg()).unwrap();
+        let meter = obs.meter();
+        let h = meter.f64_histogram("test.duration").with_unit("s").build();
+        h.record(0.123, &[]);
+
+        let mut buf = Vec::new();
+        let encoder = prometheus::TextEncoder::new();
+        let metric_families = obs.registry().gather();
+        prometheus::Encoder::encode(&encoder, &metric_families, &mut buf).unwrap();
+        let text = String::from_utf8(buf).unwrap();
+        assert!(
+            text.contains("test_duration"),
+            "expected test_duration in exposition, got:\n{text}"
+        );
+    }
+}
+```
+
+- [ ] **Step 4: Run the tests**
+
+Run: `cargo test -p service_utils observability::tests`
+Expected: 2 tests pass.
+
+If you get a compile error about `with_boundaries` not existing, the OpenTelemetry SDK version you pinned uses the older `with_explicit_buckets` name. Adjust the call in `meters.rs` accordingly. Same for `MetricExporter::builder().with_http()` — older versions used `new_exporter().http()`.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add crates/service_utils/src/observability.rs crates/service_utils/src/observability/meters.rs
+git commit -m "feat(observability): MeterProvider with prometheus exporter
+
+Builds an SdkMeterProvider wired to an opentelemetry-prometheus
+exporter that writes into a per-process prometheus::Registry. OTLP
+push exporter is plumbed but only activates when
+OTEL_EXPORTER_OTLP_ENDPOINT is set.
+"
+```
+
+---
+
+## Task 8: Route template extraction helper
+
+**Files:**
+- Modify: `crates/service_utils/src/observability/middleware.rs`
+
+- [ ] **Step 1: Add helper + tests**
+
+Append to `crates/service_utils/src/observability/middleware.rs`:
+
+```rust
+use actix_web::dev::ServiceRequest;
+
+/// Value reported as the route when the request matched no registered
+/// pattern (such a request would end in a 404).
+pub(crate) const ROUTE_NOT_FOUND: &str = "__not_found__";
+
+/// Returns the route template that matched `req`, or [`ROUTE_NOT_FOUND`] when
+/// nothing matched, so that `http.route` never carries raw request paths.
+pub(crate) fn extract_route(req: &ServiceRequest) -> String {
+    match req.match_pattern() {
+        Some(pattern) => pattern,
+        None => ROUTE_NOT_FOUND.to_owned(),
+    }
+}
+```
+
+Add tests inside the existing `#[cfg(test)] mod tests` block:
+
+```rust
+    // NOTE(review): `actix_web::test` names both the test-helper module and the
+    // `#[actix_web::test]` attribute macro, so this import may shadow the
+    // built-in `#[test]` on the synchronous tests in this module — confirm the
+    // module still compiles, or import the helpers by path instead.
+    use actix_web::{App, HttpResponse, http::StatusCode, test, web};
+
+    /// Drives one request through a templated route and checks it returns 200.
+    /// It does not call `extract_route` itself (see the note at the end).
+    #[actix_web::test]
+    async fn matched_route_returns_pattern() {
+        let app = test::init_service(
+            App::new().route(
+                "/contexts/{id}",
+                web::get().to(|| async { HttpResponse::Ok() }),
+            ),
+        )
+        .await;
+        let req = test::TestRequest::get().uri("/contexts/abc123").to_request();
+        let resp = test::call_service(&app, req).await;
+        assert_eq!(resp.status(), StatusCode::OK);
+        // Note: extract_route is exercised in the integration test in Task 15
+        // because match_pattern() is only populated mid-pipeline. This unit-test
+        // stub is kept for build-coverage of the call site.
+    }
+```
+
+- [ ] **Step 2: Run the tests**
+
+Run: `cargo test -p service_utils observability::middleware`
+Expected: previous 2 tests + 1 new still pass.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add crates/service_utils/src/observability/middleware.rs
+git commit -m "feat(observability): route template extraction helper"
+```
+
+---
+
+## Task 9: Label extraction with org/workspace from extensions
+
+**Files:**
+- Modify: `crates/service_utils/src/observability/middleware.rs`
+
+- [ ] **Step 1: Confirm extension types**
+
+Run (from the repository root): `grep -rn "OrgId\|WorkspaceName\|insert::<.*Workspace" crates/service_utils/src/middlewares/ 2>/dev/null | head -10`
+Expected: shows the actual type names that `OrgWorkspaceMiddlewareFactory` inserts into request extensions. The likely candidates are `OrgId(String)` and `WorkspaceName(String)` or similar newtypes from `superposition_types`.
+
+If the grep shows different type names, use those in the code below in place of the placeholders.
+
+- [ ] **Step 2: Add label-build helper + tests**
+
+Append to `crates/service_utils/src/observability/middleware.rs`:
+
+```rust
+use opentelemetry::KeyValue;
+use crate::observability::config::LabelConfig;
+
+/// Assembles the attribute set attached to one HTTP request's metrics.
+///
+/// The method, route and status code are always present. `sp.org_id` and
+/// `sp.workspace_id` are added only when the matching flag in `label_cfg` is
+/// enabled *and* a value was supplied; a missing value produces no attribute
+/// at all rather than an empty string, which would form its own series.
+pub(crate) fn build_attributes(
+    method: &'static str,
+    route: &str,
+    status_code: u16,
+    org_id: Option<&str>,
+    workspace: Option<&str>,
+    label_cfg: &LabelConfig,
+) -> Vec<KeyValue> {
+    // Three mandatory attributes plus at most two tenant labels.
+    let mut attrs = Vec::with_capacity(5);
+    attrs.extend([
+        KeyValue::new("http.request.method", method),
+        KeyValue::new("http.route", route.to_owned()),
+        KeyValue::new("http.response.status_code", i64::from(status_code)),
+    ]);
+
+    // A tenant label survives only if its flag is on and a value is present.
+    let org_label = org_id.filter(|_| label_cfg.with_org_label);
+    if let Some(org) = org_label {
+        attrs.push(KeyValue::new("sp.org_id", org.to_owned()));
+    }
+    let workspace_label = workspace.filter(|_| label_cfg.with_workspace_label);
+    if let Some(ws) = workspace_label {
+        attrs.push(KeyValue::new("sp.workspace_id", ws.to_owned()));
+    }
+
+    attrs
+}
+```
+
+Add to the test block:
+
+```rust
+    #[test]
+    fn build_attributes_with_all_labels() {
+        let cfg = LabelConfig { with_org_label: true, with_workspace_label: true };
+        let attrs = build_attributes("GET", "/contexts/{id}", 200, Some("org1"), Some("ws1"), &cfg);
+        assert_eq!(attrs.len(), 5);
+        assert!(attrs.iter().any(|kv| kv.key.as_str() == "sp.org_id"));
+        assert!(attrs.iter().any(|kv| kv.key.as_str() == "sp.workspace_id"));
+    }
+
+    #[test]
+    fn build_attributes_omits_missing_workspace() {
+        let cfg = LabelConfig { with_org_label: true, with_workspace_label: true };
+        let attrs = build_attributes("POST", "/orgs", 201, Some("org1"), None, &cfg);
+        assert_eq!(attrs.len(), 4);
+        assert!(attrs.iter().any(|kv| kv.key.as_str() == "sp.org_id"));
+        assert!(!attrs.iter().any(|kv| kv.key.as_str() == "sp.workspace_id"));
+    }
+
+    #[test]
+    fn build_attributes_respects_disable_flag() {
+        let cfg = LabelConfig { with_org_label: false, with_workspace_label: false };
+        let attrs = build_attributes("GET", "/x", 200, Some("org1"), Some("ws1"), &cfg);
+        assert_eq!(attrs.len(), 3);
+        assert!(!attrs.iter().any(|kv| kv.key.as_str() == "sp.org_id"));
+        assert!(!attrs.iter().any(|kv| kv.key.as_str() == "sp.workspace_id"));
+    }
+```
+
+- [ ] **Step 3: Run the tests**
+
+Run: `cargo test -p service_utils observability::middleware`
+Expected: all middleware tests pass.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add crates/service_utils/src/observability/middleware.rs
+git commit -m "feat(observability): build OTel attributes for HTTP metrics"
+```
+
+---
+
+## Task 10: `InFlightGuard` (panic-safe active-requests decrement)
+
+**Files:**
+- Modify: `crates/service_utils/src/observability/middleware.rs`
+
+- [ ] **Step 1: Add the guard + tests**
+
+Append to `middleware.rs`:
+
+```rust
+use std::sync::atomic::{AtomicBool, Ordering};
+use opentelemetry::metrics::UpDownCounter;
+
+/// Scope guard for the `http.server.active_requests` counter.
+///
+/// [`InFlightGuard::enter`] adds one to the counter; the matching subtraction
+/// happens exactly once, either through an explicit [`InFlightGuard::release`]
+/// or when the guard is dropped — so a handler that panics still gives its
+/// slot back.
+pub(crate) struct InFlightGuard {
+    counter: UpDownCounter<i64>,
+    method: &'static str,
+    decremented: AtomicBool,
+}
+
+impl InFlightGuard {
+    /// Increments `counter` for `method` and returns the guard that will undo it.
+    pub(crate) fn enter(counter: UpDownCounter<i64>, method: &'static str) -> Self {
+        let attrs = [KeyValue::new("http.request.method", method)];
+        counter.add(1, &attrs);
+        Self {
+            counter,
+            method,
+            decremented: AtomicBool::new(false),
+        }
+    }
+
+    /// Decrements the counter unless this guard has already done so.
+    pub(crate) fn release(&self) {
+        // `swap` returns the previous flag value, so only the first caller
+        // observes `false` and proceeds to decrement.
+        let already_released = self.decremented.swap(true, Ordering::Relaxed);
+        if already_released {
+            return;
+        }
+        let attrs = [KeyValue::new("http.request.method", self.method)];
+        self.counter.add(-1, &attrs);
+    }
+}
+
+impl Drop for InFlightGuard {
+    /// Falls back to `release`, which is a no-op if it already ran.
+    fn drop(&mut self) {
+        self.release();
+    }
+}
+```
+
+Add a test (this requires a real meter; we get one from `Observability::init`):
+
+```rust
+    /// Exercises release-then-drop and repeated `release()` on a real counter.
+    /// The counter value is never read back, so this only checks that these
+    /// call sequences complete without panicking.
+    #[test]
+    fn guard_decrements_on_drop_only_once() {
+        use crate::observability::{Observability, ObservabilityConfig, LabelConfig};
+        use std::time::Duration;
+
+        let cfg = ObservabilityConfig {
+            enabled: true,
+            bind: "127.0.0.1".parse().unwrap(),
+            port: 0,
+            label: LabelConfig::default(),
+            collect_interval: Duration::from_secs(10),
+            instance_id: "test".into(),
+            service_name: "sp-test".into(),
+            service_version: "0".into(),
+            deployment_environment: None,
+            otlp_endpoint: None,
+        };
+        let obs = Observability::init(cfg).unwrap();
+        let m = obs.meter().i64_up_down_counter("test.in_flight").build();
+
+        {
+            let g = InFlightGuard::enter(m.clone(), "GET");
+            g.release();
+            // Drop after explicit release; should be a no-op.
+        }
+        // Hard to introspect the counter value from outside, but we can call
+        // release multiple times and ensure no panic.
+        let g = InFlightGuard::enter(m.clone(), "POST");
+        g.release();
+        g.release();
+        drop(g);
+    }
+```
+
+- [ ] **Step 2: Run the tests**
+
+Run: `cargo test -p service_utils observability::middleware`
+Expected: all middleware tests pass.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add crates/service_utils/src/observability/middleware.rs
+git commit -m "feat(observability): RAII guard for active_requests gauge
+
+Drop-based decrement ensures the gauge stays balanced even when a
+handler panics or the future is cancelled.
+"
+```
+
+---
+
+## Task 11: Full `MetricsMiddleware` (Transform + Service)
+
+**Files:**
+- Modify: `crates/service_utils/src/observability/middleware.rs`
+
+- [ ] **Step 1: Implement the full middleware**
+
+Replace the placeholder `pub struct MetricsMiddleware;` with the full implementation. Append/replace at the bottom of `middleware.rs` (keep all helpers and tests above intact):
+
+```rust
+use std::future::{Ready, ready};
+use std::pin::Pin;
+use std::rc::Rc;
+use std::task::{Context, Poll};
+use std::time::Instant;
+
+use actix_web::{
+    Error, HttpMessage,
+    body::MessageBody,
+    dev::{Service, ServiceResponse, Transform, forward_ready},
+};
+use futures_util::future::LocalBoxFuture;
+use opentelemetry::metrics::Meter;
+
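+// `LabelConfig` is already imported by the Task 9 snippet appended to this same
+// file; repeating `use crate::observability::config::LabelConfig;` here would
+// fail with E0252 ("defined multiple times"), so it is intentionally omitted.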
+use crate::observability::meters::HttpMeters;
+
+/// Middleware factory holding the HTTP instruments and the label settings
+/// that are copied into each per-service middleware instance.
+#[derive(Clone)]
+pub struct MetricsMiddleware {
+    meters: HttpMeters,
+    label_cfg: LabelConfig,
+}
+
+impl MetricsMiddleware {
+    /// Creates the HTTP instruments on `meter` and stores `label_cfg`.
+    pub fn new(meter: &Meter, label_cfg: LabelConfig) -> Self {
+        Self { meters: HttpMeters::new(meter), label_cfg }
+    }
+}
+
+/// Wraps an inner service in a `MetricsMiddlewareImpl`. Construction cannot
+/// fail, so the returned future is immediately ready.
+impl<S, B> Transform<S, ServiceRequest> for MetricsMiddleware
+where
+    S: Service<ServiceRequest, Response = ServiceResponse<B>, Error = Error> + 'static,
+    S::Future: 'static,
+    B: MessageBody + 'static,
+{
+    type Response = ServiceResponse<B>;
+    type Error = Error;
+    type InitError = ();
+    type Transform = MetricsMiddlewareImpl<S>;
+    type Future = Ready<Result<Self::Transform, Self::InitError>>;
+
+    fn new_transform(&self, service: S) -> Self::Future {
+        ready(Ok(MetricsMiddlewareImpl {
+            // `Rc` lets `call` hand a handle to the boxed response future.
+            service: Rc::new(service),
+            meters: self.meters.clone(),
+            label_cfg: self.label_cfg,
+        }))
+    }
+}
+
+/// Per-service middleware instance produced by `MetricsMiddleware`: the wrapped
+/// service plus the instruments and label settings used to record each call.
+pub struct MetricsMiddlewareImpl<S> {
+    service: Rc<S>,
+    meters: HttpMeters,
+    label_cfg: LabelConfig,
+}
+
+/// Records duration, busy time and in-flight count around every call to the
+/// wrapped service.
+impl<S, B> Service<ServiceRequest> for MetricsMiddlewareImpl<S>
+where
+    S: Service<ServiceRequest, Response = ServiceResponse<B>, Error = Error> + 'static,
+    S::Future: 'static,
+    B: MessageBody + 'static,
+{
+    type Response = ServiceResponse<B>;
+    type Error = Error;
+    type Future = LocalBoxFuture<'static, Result<Self::Response, Self::Error>>;
+
+    forward_ready!(service);
+
+    /// Times the inner call and records the request metrics once it resolves.
+    ///
+    /// The in-flight counter is incremented before the inner service runs and
+    /// released after recording; the guard's `Drop` covers the paths where
+    /// the future never reaches that point.
+    fn call(&self, req: ServiceRequest) -> Self::Future {
+        let service = Rc::clone(&self.service);
+        let meters = self.meters.clone();
+        let label_cfg = self.label_cfg;
+
+        // Captured up front: `req` is moved into the inner service below.
+        let method_normalized = normalize_method(req.method());
+        let start = Instant::now();
+        let guard = InFlightGuard::enter(meters.active_requests.clone(), method_normalized);
+
+        Box::pin(async move {
+            let result = service.call(req).await;
+            let elapsed = start.elapsed().as_secs_f64();
+
+            let attrs = match &result {
+                Ok(res) => {
+                    let route = extract_route_from_response(res);
+                    let status = res.status().as_u16();
+                    // Clone the values out so the `extensions()` borrow ends
+                    // before the attributes are built.
+                    let org = res
+                        .request()
+                        .extensions()
+                        .get::<OrgIdExt>()
+                        .map(|o| o.0.clone());
+                    let ws = res
+                        .request()
+                        .extensions()
+                        .get::<WorkspaceNameExt>()
+                        .map(|w| w.0.clone());
+
+                    build_attributes(
+                        method_normalized,
+                        &route,
+                        status,
+                        org.as_deref(),
+                        ws.as_deref(),
+                        &label_cfg,
+                    )
+                }
+                // No response is available on this path, so the route and the
+                // tenant labels are unknown; the request is recorded as a 500
+                // under the not-found route sentinel.
+                Err(_) => build_attributes(
+                    method_normalized,
+                    ROUTE_NOT_FOUND,
+                    500,
+                    None,
+                    None,
+                    &label_cfg,
+                ),
+            };
+
+            meters.request_duration.record(elapsed, &attrs);
+            // Busy time is recorded for failed requests as well: they occupied
+            // the server for `elapsed` seconds just like successful ones.
+            meters.busy_duration.add(
+                elapsed,
+                &[KeyValue::new("http.request.method", method_normalized)],
+            );
+
+            guard.release();
+            result
+        })
+    }
+}
+
+/// Returns the route template recorded on the response's request, or
+/// `ROUTE_NOT_FOUND` when no pattern matched.
+fn extract_route_from_response<B>(res: &ServiceResponse<B>) -> String {
+    match res.request().match_pattern() {
+        Some(pattern) => pattern,
+        None => ROUTE_NOT_FOUND.to_owned(),
+    }
+}
+
+/// Newtype wrappers used to read org/workspace from request extensions.
+/// Replace these with the real types inserted by `OrgWorkspaceMiddlewareFactory`
+/// (verified in Task 9 Step 1) — typically something like
+/// `superposition_types::OrgId(pub String)` and `WorkspaceName(pub String)`.
+#[derive(Clone)]
+pub(crate) struct OrgIdExt(pub String);
+#[derive(Clone)]
+pub(crate) struct WorkspaceNameExt(pub String);
+```
+
+**Important — replace `OrgIdExt` / `WorkspaceNameExt` with the real extension types** that `OrgWorkspaceMiddlewareFactory` inserts (verified at Task 9 Step 1). If they live in `superposition_types`, just import them and use them directly. The newtype shims above only exist as a fallback to keep this task buildable in isolation.
+
+- [ ] **Step 2: Add an end-to-end test for the middleware**
+
+Append to the test module:
+
+```rust
+    use crate::observability::{Observability, ObservabilityConfig};
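+    // `App` is already imported by the Task 8 test snippet in this module; a
+    // second `use actix_web::App;` here would fail with E0252.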
+    use std::time::Duration;
+
+    fn obs_for_test() -> Observability {
+        let cfg = ObservabilityConfig {
+            enabled: true,
+            bind: "127.0.0.1".parse().unwrap(),
+            port: 0,
+            label: LabelConfig::default(),
+            collect_interval: Duration::from_secs(10),
+            instance_id: "test".into(),
+            service_name: "sp-test".into(),
+            service_version: "0".into(),
+            deployment_environment: None,
+            otlp_endpoint: None,
+        };
+        Observability::init(cfg).unwrap()
+    }
+
+    #[actix_web::test]
+    async fn middleware_records_request_duration() {
+        let obs = obs_for_test();
+        let mw = MetricsMiddleware::new(&obs.meter(), LabelConfig::default());
+        let app = test::init_service(
+            App::new().wrap(mw).route(
+                "/ping",
+                web::get().to(|| async { HttpResponse::Ok().body("pong") }),
+            ),
+        )
+        .await;
+
+        let req = test::TestRequest::get().uri("/ping").to_request();
+        let resp = test::call_service(&app, req).await;
+        assert_eq!(resp.status(), StatusCode::OK);
+
+        let mut buf = Vec::new();
+        let metric_families = obs.registry().gather();
+        prometheus::Encoder::encode(
+            &prometheus::TextEncoder::new(),
+            &metric_families,
+            &mut buf,
+        )
+        .unwrap();
+        let text = String::from_utf8(buf).unwrap();
+        assert!(text.contains("http_server_request_duration_seconds_count"), "{text}");
+        assert!(text.contains("http_server_busy_duration_seconds_total"), "{text}");
+        assert!(text.contains("http_server_active_requests"), "{text}");
+        assert!(text.contains("http_route=\"/ping\""), "{text}");
+    }
+```
+
+- [ ] **Step 3: Run the tests**
+
+Run: `cargo test -p service_utils observability::middleware`
+Expected: all middleware tests pass.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add crates/service_utils/src/observability/middleware.rs
+git commit -m "feat(observability): MetricsMiddleware records HTTP signals
+
+Wraps every request with timing + active_requests gauge + busy_duration
+counter. Uses match_pattern() to template routes, OrgWorkspaceMiddleware
+extensions for tenant labels, and an InFlightGuard for panic safety.
+"
+```
+
+---
+
+## Task 12: Health endpoints
+
+**Files:**
+- Modify: `crates/service_utils/src/observability/health.rs`
+
+- [ ] **Step 1: Replace stub with real handlers**
+
+```rust
+//! Health probe endpoints mounted on the main app port.
+//!
+//! Paths are added to `tenant_middleware_exclusion_list` so they bypass auth.
+
+use actix_web::{HttpResponse, Scope, web};
+
+pub const HEALTHZ: &str = "/healthz";
+pub const LIVEZ: &str = "/livez";
+pub const READYZ: &str = "/readyz";
+
+/// Returns the Actix scope to mount on the main app:
+/// `App::new().service(observability::health_endpoints())`.
+///
+/// The scope registers `GET` handlers for [`HEALTHZ`], [`LIVEZ`] and [`READYZ`].
+///
+/// NOTE(review): the scope prefix is empty, so it presumably matches every
+/// request path; confirm that services registered after it are still
+/// reachable, or register this scope last.
+pub fn health_endpoints() -> Scope {
+    web::scope("")
+        .route(HEALTHZ, web::get().to(healthz))
+        .route(LIVEZ, web::get().to(livez))
+        .route(READYZ, web::get().to(readyz))
+}
+
+/// Paths to add to the auth exclusion list.
+///
+/// Returns the same three path constants that `health_endpoints` routes.
+pub fn health_endpoint_paths() -> &'static [&'static str] {
+    &[HEALTHZ, LIVEZ, READYZ]
+}
+
+/// Always answers `200 OK` with the plain-text body `ok`.
+async fn healthz() -> HttpResponse {
+    HttpResponse::Ok().content_type("text/plain; charset=utf-8").body("ok")
+}
+
+/// Always answers `200 OK` with the plain-text body `ok`.
+async fn livez() -> HttpResponse {
+    HttpResponse::Ok().content_type("text/plain; charset=utf-8").body("ok")
+}
+
+/// Always answers `200 OK` with the plain-text body `ok`; no dependency is
+/// checked yet.
+async fn readyz() -> HttpResponse {
+    // v1: same as livez. Future: check DB pool, Redis, dependencies.
+    HttpResponse::Ok().content_type("text/plain; charset=utf-8").body("ok")
+}
+
+// Tests for the health probe routes and the exported path list.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    // The helpers are imported by path rather than via `actix_web::test`
+    // itself: that path also names the `#[actix_web::test]` attribute macro,
+    // and importing it as `test` would shadow the built-in `#[test]` used by
+    // the synchronous test below.
+    use actix_web::test::{TestRequest, call_service, init_service};
+    use actix_web::{App, http::StatusCode};
+
+    /// Every advertised probe path answers `GET` with 200.
+    #[actix_web::test]
+    async fn each_endpoint_returns_200_ok() {
+        let app = init_service(App::new().service(health_endpoints())).await;
+        for path in health_endpoint_paths() {
+            let req = TestRequest::get().uri(path).to_request();
+            let resp = call_service(&app, req).await;
+            assert_eq!(resp.status(), StatusCode::OK, "GET {path}");
+        }
+    }
+
+    /// The exported path list stays in sync with the route constants.
+    #[test]
+    fn paths_list_matches_routes() {
+        let paths = health_endpoint_paths();
+        assert_eq!(paths, &[HEALTHZ, LIVEZ, READYZ]);
+    }
+}
+```
+
+- [ ] **Step 2: Run the tests**
+
+Run: `cargo test -p service_utils observability::health`
+Expected: 2 tests pass.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add crates/service_utils/src/observability/health.rs
+git commit -m "feat(observability): /healthz /livez /readyz handlers"
+```
+
+---
+
+## Task 13: Metrics server (separate `HttpServer` on `SUPERPOSITION_METRICS_PORT`)
+
+**Files:**
+- Modify: `crates/service_utils/src/observability/metrics_server.rs`
+
+- [ ] **Step 1: Replace stub with real implementation**
+
+```rust
+//! Separate HttpServer that exposes /metrics on SUPERPOSITION_METRICS_PORT.
+
+use std::{net::SocketAddr, sync::Arc};
+
+use actix_web::{App, HttpResponse, HttpServer, dev::Server, web};
+use prometheus::{Encoder, Registry, TextEncoder};
+
+/// Builds a single-worker `HttpServer` bound to `bind` that serves only
+/// `GET /metrics` from `registry`.
+///
+/// The returned actix `Server` handle is meant to be awaited by the caller
+/// alongside the main application server.
+///
+/// # Errors
+///
+/// Propagates the I/O error from binding the listening socket.
+pub fn spawn_metrics_server(
+    registry: Arc<Registry>,
+    bind: SocketAddr,
+) -> std::io::Result<Server> {
+    let shared_registry = web::Data::new(registry);
+
+    // The factory closure hands each `App` it builds its own handle to the
+    // shared registry.
+    let app_factory = move || {
+        App::new()
+            .app_data(shared_registry.clone())
+            .route("/metrics", web::get().to(scrape))
+    };
+
+    let server = HttpServer::new(app_factory).workers(1).bind(bind)?.run();
+    Ok(server)
+}
+
+/// Handler for `GET /metrics`: gathers the registry and renders it with the
+/// Prometheus text encoder. An encoding failure yields a 500 whose body
+/// carries the error message.
+async fn scrape(registry: web::Data<Arc<Registry>>) -> HttpResponse {
+    let encoder = TextEncoder::new();
+    let families = registry.gather();
+    let mut payload = Vec::new();
+
+    match encoder.encode(&families, &mut payload) {
+        Ok(()) => HttpResponse::Ok()
+            .content_type(encoder.format_type())
+            .body(payload),
+        Err(e) => HttpResponse::InternalServerError()
+            .body(format!("encode error: {e}")),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use actix_web::{App, http::StatusCode, test};
+
+    #[actix_web::test]
+    async fn scrape_endpoint_returns_text_plain() {
+        let registry = Arc::new(Registry::new());
+        let app = test::init_service(
+            App::new()
+                .app_data(web::Data::new(registry.clone()))
+                .route("/metrics", web::get().to(scrape)),
+        )
+        .await;
+        let req = test::TestRequest::get().uri("/metrics").to_request();
+        let resp = test::call_service(&app, req).await;
+        assert_eq!(resp.status(), StatusCode::OK);
+        let ct = resp
+            .headers()
+            .get("content-type")
+            .unwrap()
+            .to_str()
+            .unwrap();
+        assert!(ct.starts_with("text/plain"), "got {ct}");
+    }
+}
+```
+
+- [ ] **Step 2: Run the tests**
+
+Run: `cargo test -p service_utils observability::metrics_server`
+Expected: 1 test passes.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add crates/service_utils/src/observability/metrics_server.rs
+git commit -m "feat(observability): /metrics server on dedicated port"
+```
+
+---
+
+## Task 14: DB pool saturation (r2d2 ObservableGauge callbacks)
+
+**Files:**
+- Create: `crates/service_utils/src/observability/saturation/db_pool.rs`
+- Modify: `crates/service_utils/src/observability/saturation.rs`
+
+- [ ] **Step 1: Confirm pool type**
+
+Run (from the repository root): `grep -rn "type DbPool\|r2d2::Pool<\|PgPool" crates/service_utils/src/db/ crates/service_utils/src/service/ 2>/dev/null | head -10`
+Expected: shows the concrete `r2d2::Pool<…>` alias used across the codebase. The `DbPoolHandle` type alias below should be set to that exact type (commonly `r2d2::Pool<diesel::r2d2::ConnectionManager<diesel::PgConnection>>`).
+
+- [ ] **Step 2: Create the db_pool module**
+
+```rust
+//! ObservableGauge callbacks for the r2d2 connection pool. Purely passive —
+//! no instrumentation at `pool.get()` call sites.
+
+use opentelemetry::{KeyValue, metrics::Meter};
+
+/// Shared handle to the r2d2 Postgres pool observed by this module. Adjust the
+/// alias if the codebase uses a different concrete pool type.
+pub type DbPoolHandle = std::sync::Arc<
+    r2d2::Pool<diesel::r2d2::ConnectionManager<diesel::PgConnection>>,
+>;
+
+/// Registers two observable gauges for `pool` on `meter`, both tagged with
+/// `pool.name = pool_name`:
+///
+/// * `db.client.connections.usage` — one observation for `state = "idle"` and
+///   one for `state = "used"` (total connections minus idle ones);
+/// * `db.client.connections.max` — the pool's configured maximum size.
+///
+/// The values are read from the pool inside the gauge callbacks.
+pub fn register(meter: &Meter, pool: DbPoolHandle, pool_name: &'static str) {
+    let usage_pool = pool.clone();
+    // Attribute sets are fixed per pool, so they are built once and moved
+    // into the callback.
+    let idle_attrs = [
+        KeyValue::new("state", "idle"),
+        KeyValue::new("pool.name", pool_name),
+    ];
+    let used_attrs = [
+        KeyValue::new("state", "used"),
+        KeyValue::new("pool.name", pool_name),
+    ];
+    meter
+        .u64_observable_gauge("db.client.connections.usage")
+        .with_description("Number of DB connections in idle/used state.")
+        .with_callback(move |observer| {
+            let state = usage_pool.state();
+            let idle = state.idle_connections;
+            // Saturating: never underflows even if idle exceeds the total.
+            let used = state.connections.saturating_sub(idle);
+            observer.observe(u64::from(idle), &idle_attrs);
+            observer.observe(u64::from(used), &used_attrs);
+        })
+        .build();
+
+    let max_attrs = [KeyValue::new("pool.name", pool_name)];
+    meter
+        .u64_observable_gauge("db.client.connections.max")
+        .with_description("Configured maximum size of the DB connection pool.")
+        .with_callback(move |observer| {
+            observer.observe(u64::from(pool.max_size()), &max_attrs);
+        })
+        .build();
+}
+```
+
+- [ ] **Step 3: Wire it via the saturation entry point**
+
+Replace `crates/service_utils/src/observability/saturation.rs`:
+
+```rust
+//! Saturation collectors: DB pool, Redis pool, Tokio runtime.
+//!
+//! Most metrics are observable-gauge callbacks (no background tasks).
+//! Only `tokio_runtime` requires a polling loop.
+
+mod db_pool;
+mod redis_pool;
+mod tokio_runtime;
+
+use opentelemetry::metrics::Meter;
+
+pub use db_pool::DbPoolHandle;
+
+/// Optional dependencies the saturation subsystem can observe.
+#[derive(Default, Clone)]
+pub struct SaturationDeps {
+    pub db_pool: Option<DbPoolHandle>,
+    pub redis_client: Option<redis_pool::RedisHandle>,
+    pub tokio_collect_interval: std::time::Duration,
+}
+
+pub fn register_observers(
+    meter: &Meter,
+    deps: SaturationDeps,
+) -> Result<(), super::ObservabilityError> {
+    if let Some(pool) = deps.db_pool {
+        db_pool::register(meter, pool, "primary");
+    }
+    if let Some(client) = deps.redis_client {
+        redis_pool::register(meter, client, "primary");
+    }
+
+    #[cfg(tokio_unstable)]
+    if deps.tokio_collect_interval > std::time::Duration::ZERO {
+        tokio_runtime::spawn(meter, deps.tokio_collect_interval);
+    }
+
+    Ok(())
+}
+```
+
+- [ ] **Step 4: Verify it compiles**
+
+Run: `cargo check -p service_utils`
+Expected: exit code 0. Failures here are usually:
+- The `DbPoolHandle` alias doesn't match the codebase's actual pool type → adjust to whatever Step 1 found.
+- `redis_pool` module doesn't exist yet → that's Task 15. Create an empty stub now: `crates/service_utils/src/observability/saturation/redis_pool.rs` with:
+  ```rust
+  //! Stub — implemented in Task 15.
+  use opentelemetry::metrics::Meter;
+  pub type RedisHandle = std::sync::Arc<()>;
+  pub fn register(_meter: &Meter, _client: RedisHandle, _pool_name: &'static str) {}
+  ```
+- Same for `tokio_runtime` (Task 16). Stub: `crates/service_utils/src/observability/saturation/tokio_runtime.rs` with:
+  ```rust
+  //! Stub — implemented in Task 16.
+  #[cfg(tokio_unstable)]
+  pub fn spawn(_meter: &opentelemetry::metrics::Meter, _interval: std::time::Duration) {}
+  ```
+
+- [ ] **Step 5: Add a smoke test**
+
+Append to `db_pool.rs`:
+
+```rust
+#[cfg(test)]
+mod tests {
+    // Constructing a real r2d2 pool requires a database, so no unit test is
+    // defined here yet; this module is an intentional placeholder. `register`
+    // is compile-checked in Step 4 and first exercised at runtime when Task 18
+    // wires the observers into the binary and smoke-tests `/metrics`.
+}
+```
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add crates/service_utils/src/observability/saturation.rs crates/service_utils/src/observability/saturation/
+git commit -m "feat(observability): db pool saturation gauges
+
+ObservableGauge callbacks read r2d2::Pool::state() at scrape time.
+Emits db.client.connections.usage{state} and db.client.connections.max.
+"
+```
+
+---
+
+## Task 15: Redis pool saturation (fred metrics)
+
+**Files:**
+- Modify: `crates/service_utils/src/observability/saturation/redis_pool.rs`
+
+- [ ] **Step 1: Find fred client type and metrics surface**
+
+Run (from the repository root): `grep -rn "fred::\|RedisClient\|Pool<RedisClient>" crates/service_utils/src/redis* 2>/dev/null | head -10`
+Expected: shows the concrete fred client type (likely `fred::clients::RedisClient` or `fred::clients::RedisPool`).
+
+Also run: `cargo doc -p fred --no-deps --open` *(or browse https://docs.rs/fred/latest/fred/)* and locate the metrics API. fred 9.x exposes per-client `read_latency_metrics()` / `write_latency_metrics()` and connection counters via `Server` / `Stats` types. Pin the names you actually find.
+
+- [ ] **Step 2: Replace stub**
+
+```rust
+//! Saturation gauges for the Redis client pool (fred crate).
+//!
+//! fred's `metrics` feature exposes per-client / per-pool stats. The
+//! callbacks below are intentionally tolerant: if a stat is unavailable
+//! in the version we use, the metric is simply not emitted (a TODO is
+//! left at the call site).
+
+use std::sync::Arc;
+
+use opentelemetry::{KeyValue, metrics::Meter};
+
+/// Shared, type-erased handle to whatever fred client/pool wrapper the rest of
+/// `service_utils` uses. Implement `RedisStats` on the wrapper `crate::redis` exposes.
+pub type RedisHandle = Arc<dyn RedisStats + Send + Sync>;
+
+/// Tiny abstraction so the metrics module doesn't have to know fred's
+/// concrete types. Implement on the wrapper that `crate::redis` already
+/// hands around.
+pub trait RedisStats {
+    fn idle_connections(&self) -> Option<u64>;
+    fn used_connections(&self) -> Option<u64>;
+    fn commands_in_flight(&self) -> Option<u64>;
+}
+
+pub fn register(meter: &Meter, client: RedisHandle, pool_name: &'static str) {
+    let usage_label = KeyValue::new("pool.name", pool_name);
+
+    let c = client.clone();
+    let label = usage_label.clone();
+    meter
+        .u64_observable_gauge("redis.client.connections.usage")
+        .with_description("Number of Redis connections in idle/used state.")
+        .with_callback(move |observer| {
+            if let Some(idle) = c.idle_connections() {
+                observer.observe(
+                    idle,
+                    &[KeyValue::new("state", "idle"), label.clone()],
+                );
+            }
+            if let Some(used) = c.used_connections() {
+                observer.observe(
+                    used,
+                    &[KeyValue::new("state", "used"), label.clone()],
+                );
+            }
+        })
+        .build();
+
+    let c = client.clone();
+    let label = usage_label.clone();
+    meter
+        .u64_observable_gauge("redis.client.commands.in_flight")
+        .with_description("Number of Redis commands currently in flight.")
+        .with_callback(move |observer| {
+            if let Some(n) = c.commands_in_flight() {
+                observer.observe(n, &[label.clone()]);
+            }
+        })
+        .build();
+}
+```
+
+- [ ] **Step 3: Wire `RedisStats` to whatever fred client you use**
+
+Find the wrapper in `crate::redis` (the internal Redis surface), and `impl RedisStats for YourWrapper` using the fred metrics surface confirmed in Step 1. If a particular field is unavailable in your fred version, leave the impl returning `None` and a `// TODO(observability): expose <field> when fred …` comment.
+
+This step is intentionally light on prescriptive code because the exact fred API depends on the pinned version. The contract is just three `Option<u64>` getters; all three returning `None` is acceptable for v1 — the metrics simply won't have data.
+
+- [ ] **Step 4: Verify it compiles**
+
+Run: `cargo check -p service_utils`
+Expected: exit code 0.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add crates/service_utils/src/observability/saturation/redis_pool.rs crates/service_utils/src/redis*
+git commit -m "feat(observability): redis pool saturation gauges
+
+ObservableGauge callbacks read fred client/pool stats via a thin
+RedisStats trait. Tolerant to missing fields — a None return simply
+omits the metric.
+"
+```
+
+---
+
+## Task 16: Tokio runtime saturation (`cfg(tokio_unstable)`)
+
+**Files:**
+- Modify: `crates/service_utils/src/observability/saturation/tokio_runtime.rs`
+
+- [ ] **Step 1: Replace stub**
+
+```rust
+//! Tokio runtime saturation, gated on `cfg(tokio_unstable)`.
+//!
+//! Unlike DB/Redis, `tokio_metrics::RuntimeMonitor` is delta-based: each
+//! `.next()` on the `.intervals()` iterator returns stats since the previous
+//! sample. So we run a background task that samples every `interval` and
+//! stores derived values in atomics that observable-gauge callbacks read.
+
+#[cfg(not(tokio_unstable))]
+pub fn spawn(_meter: &opentelemetry::metrics::Meter, _interval: std::time::Duration) {}
+
+#[cfg(tokio_unstable)]
+mod inner {
+    use std::sync::Arc;
+    use std::sync::atomic::{AtomicU64, Ordering};
+    use std::time::Duration;
+
+    use opentelemetry::metrics::Meter;
+    use tokio_metrics::RuntimeMonitor;
+
+    #[derive(Default)]
+    struct Snapshot {
+        workers: AtomicU64,
+        global_queue_depth: AtomicU64,
+        busy_ratio_milli: AtomicU64, // busy_ratio * 1000, stored as integer
+    }
+
+    pub fn spawn(meter: &Meter, interval: Duration) {
+        let handle = match tokio::runtime::Handle::try_current() {
+            Ok(h) => h,
+            Err(_) => return, // not running on a tokio runtime; no-op
+        };
+        let snap = Arc::new(Snapshot::default());
+
+        // Background sampler.
+        let snap_for_task = snap.clone();
+        tokio::spawn(async move {
+            let monitor = RuntimeMonitor::new(&handle);
+            let mut intervals = monitor.intervals();
+            loop {
+                if let Some(m) = intervals.next() {
+                    snap_for_task
+                        .workers
+                        .store(m.workers_count as u64, Ordering::Relaxed);
+                    snap_for_task
+                        .global_queue_depth
+                        .store(m.global_queue_depth as u64, Ordering::Relaxed);
+                    let busy = m.total_busy_duration.as_secs_f64();
+                    let total = (m.total_polls_count as f64).max(1.0)
+                        * interval.as_secs_f64()
+                        * (m.workers_count as f64).max(1.0);
+                    let ratio = (busy / total).clamp(0.0, 1.0);
+                    snap_for_task
+                        .busy_ratio_milli
+                        .store((ratio * 1000.0) as u64, Ordering::Relaxed);
+                }
+                tokio::time::sleep(interval).await;
+            }
+        });
+
+        // ObservableGauges read from the snapshot atomics.
+        let s = snap.clone();
+        meter
+            .u64_observable_gauge("runtime.tokio.workers")
+            .with_callback(move |observer| {
+                observer.observe(s.workers.load(Ordering::Relaxed), &[]);
+            })
+            .build();
+
+        let s = snap.clone();
+        meter
+            .u64_observable_gauge("runtime.tokio.global_queue.depth")
+            .with_callback(move |observer| {
+                observer.observe(s.global_queue_depth.load(Ordering::Relaxed), &[]);
+            })
+            .build();
+
+        let s = snap.clone();
+        meter
+            .f64_observable_gauge("runtime.tokio.workers.busy_ratio")
+            .with_callback(move |observer| {
+                let milli = s.busy_ratio_milli.load(Ordering::Relaxed);
+                observer.observe(milli as f64 / 1000.0, &[]);
+            })
+            .build();
+    }
+}
+
+#[cfg(tokio_unstable)]
+pub use inner::spawn;
+```
+
+- [ ] **Step 2: Verify it compiles both with and without the cfg**
+
+Run: `cargo check -p service_utils`
+Expected: exit code 0 (the workspace `.cargo/config.toml` enables `tokio_unstable`, so the `inner` branch compiles).
+
+Run: `RUSTFLAGS="" cargo check -p service_utils`
+Expected: exit code 0 (the `not(tokio_unstable)` no-op stub compiles when the flag is off).
+
+If `tokio_metrics::RuntimeMonitor::intervals()` field names differ from those used above (`workers_count`, `global_queue_depth`, `total_busy_duration`, `total_polls_count`), adjust to match. The fundamental shape — `.intervals()` returning a delta iterator — is stable across recent versions.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add crates/service_utils/src/observability/saturation/tokio_runtime.rs
+git commit -m "feat(observability): tokio runtime saturation gauges
+
+Background sampler updates atomic snapshots; observable gauges read
+from the snapshot. Gated on cfg(tokio_unstable); compiles to a no-op
+when the flag is disabled.
+"
+```
+
+---
+
+## Task 17: Add health paths to `tenant_middleware_exclusion_list`
+
+**Files:**
+- Modify: `crates/superposition/src/app_state.rs`
+
+- [ ] **Step 1: Make health paths always-excluded**
+
+Replace lines 101–107 of `crates/superposition/src/app_state.rs` (the `tenant_middleware_exclusion_list` field assignment). Use `Read` first to confirm the surrounding context, then `Edit`:
+
+```rust
+        tenant_middleware_exclusion_list: {
+            let mut set = get_from_env_unsafe::<String>(
+                "TENANT_MIDDLEWARE_EXCLUSION_LIST",
+            )
+            .expect("TENANT_MIDDLEWARE_EXCLUSION_LIST is not set")
+            .split(',')
+            .map(String::from)
+            .collect::<HashSet<_>>();
+            // Always exclude observability health endpoints from auth checks.
+            set.extend(
+                service_utils::observability::health_endpoint_paths()
+                    .iter()
+                    .map(|s| s.to_string()),
+            );
+            set
+        },
+```
+
+- [ ] **Step 2: Verify it compiles**
+
+Run: `cargo check -p superposition`
+Expected: exit code 0.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add crates/superposition/src/app_state.rs
+git commit -m "feat: exclude /healthz /livez /readyz from auth checks
+
+Adds the observability health paths to tenant_middleware_exclusion_list
+so probes do not trigger auth flow. Operators no longer need to remember
+to put these in TENANT_MIDDLEWARE_EXCLUSION_LIST.
+"
+```
+
+---
+
+## Task 18: Wire observability into `main.rs`
+
+**Files:**
+- Modify: `crates/superposition/src/main.rs`
+
+- [ ] **Step 1: Add imports + early init**
+
+At the top of `main()` in `crates/superposition/src/main.rs` (after tracing init, before app_state construction), add:
+
+```rust
+    use service_utils::observability::{
+        self, MetricsMiddleware, Observability, ObservabilityConfig, SaturationDeps,
+    };
+
+    let obs_cfg = ObservabilityConfig::from_env()
+        .expect("invalid observability env config");
+    let obs_enabled = obs_cfg.enabled;
+    let observability = if obs_enabled {
+        Some(Observability::init(obs_cfg.clone()).expect("observability init failed"))
+    } else {
+        None
+    };
+    let metrics_meter = observability.as_ref().map(|o| o.meter());
+    let metrics_label_cfg = obs_cfg.label;
+```
+
+- [ ] **Step 2: Register saturation observers**
+
+After `app_state` is built (so the DB pool is available) and inside the tokio runtime context, add:
+
+```rust
+    if let (Some(obs), Some(pool)) = (observability.as_ref(), Some(app_state.db_pool.clone())) {
+        observability::register_observers(
+            &obs.meter(),
+            SaturationDeps {
+                db_pool: Some(pool),
+                redis_client: app_state.redis_client.clone().map(Into::into),
+                tokio_collect_interval: obs_cfg.collect_interval,
+            },
+        )
+        .expect("saturation observer registration failed");
+    }
+```
+
+(Adjust `app_state.db_pool` and `app_state.redis_client` to whatever fields actually exist on `AppState`. If the redis client isn't easily Arc-wrapped, pass `None` and leave the metric unattached for now.)
+
+- [ ] **Step 3: Spawn the metrics server**
+
+After the saturation registration, add:
+
+```rust
+    let metrics_server_handle = if let Some(obs) = observability.as_ref() {
+        let bind: std::net::SocketAddr = format!("{}:{}", obs_cfg.bind, obs_cfg.port)
+            .parse()
+            .expect("invalid metrics bind addr");
+        Some(observability::spawn_metrics_server(obs.registry(), bind)?)
+    } else {
+        None
+    };
+```
+
+- [ ] **Step 4: Add the middleware to the App builder**
+
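+In the `HttpServer::new(move || App::new()…)` closure, add the middleware as an outer `.wrap()` — after the existing auth/logging wraps and immediately before the final `TracingLogger` wrap, as shown below (Actix runs the last-registered middleware first, so only `TracingLogger` sits outside the metrics middleware):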
+
+```rust
+            .service(observability::health_endpoints())
+            // ... existing .service() and .wrap() calls (auth_z, auth_n, ...
+            //     RequestResponseLogger, TracingLogger) ...
+            .wrap(actix_web::middleware::Condition::new(
+                obs_enabled,
+                metrics_meter
+                    .as_ref()
+                    .map(|m| MetricsMiddleware::new(m, metrics_label_cfg))
+                    .unwrap_or_else(|| MetricsMiddleware::new(
+                        // construct a no-op meter for the disabled case
+                        &opentelemetry::global::meter("noop"),
+                        metrics_label_cfg,
+                    )),
+            ))
+            .wrap(TracingLogger::<CustomRootSpanBuilder>::new())
+```
+
+The `Condition` wrapper makes the middleware a no-op when `SUPERPOSITION_METRICS_ENABLED=false`. Match the `&meter` borrow shape that `MetricsMiddleware::new` expects.
+
+- [ ] **Step 5: Run both servers concurrently**
+
+Replace the final `.run().await` with a `try_join!` over the main and metrics servers. Example:
+
+```rust
+    let main_server = HttpServer::new(/* ... */)
+        .bind(("0.0.0.0", cac_port))?
+        .workers(get_from_env_or_default("ACTIX_WORKER_COUNT", 5))
+        .keep_alive(Duration::from_secs(
+            get_from_env_unsafe("ACTIX_KEEP_ALIVE").unwrap_or(120),
+        ))
+        .run();
+
+    match metrics_server_handle {
+        Some(metrics) => {
+            futures_util::try_join!(main_server, metrics)?;
+        }
+        None => {
+            main_server.await?;
+        }
+    }
+```
+
+(`futures_util` is already a workspace dep — see root `Cargo.toml`.)
+
+- [ ] **Step 6: Build the binary**
+
+Run: `cargo build -p superposition`
+Expected: exit code 0. Compilation errors here are the hardest part of the wiring; iterate on imports and types until clean.
+
+- [ ] **Step 7: Smoke-test locally**
+
+Start the binary against the local docker-compose dev stack:
+
+```bash
+make run    # or whatever the makefile target is
+```
+
+In another shell:
+
+```bash
+curl -s -i http://localhost:8080/healthz
+curl -s http://localhost:9091/metrics | head -50
+```
+
+Expected:
+
+- `/healthz` returns `200 OK` with body `ok`.
+- `/metrics` returns Prometheus exposition that includes lines starting with `# HELP http_server_request_duration_seconds`, `http_server_active_requests`, `http_server_busy_duration_seconds_total`, and (after issuing a few API requests) `http_server_request_duration_seconds_bucket{...}` lines with `http_route` labels.
+
+Stop the binary.
+
+- [ ] **Step 8: Commit**
+
+```bash
+git add crates/superposition/src/main.rs
+git commit -m "feat: wire observability into main binary
+
+- Init Observability early (Prometheus exporter + optional OTLP push)
+- Spawn metrics server on SUPERPOSITION_METRICS_PORT
+- Register DB/Redis/Tokio saturation observers
+- Wrap App with MetricsMiddleware (gated by SUPERPOSITION_METRICS_ENABLED)
+- Mount /healthz /livez /readyz on the main app
+- try_join! both servers so the process exits if either dies
+"
+```
+
+---
+
+## Task 19: Integration test — full pipeline through `/metrics`
+
+**Files:**
+- Create: `crates/service_utils/tests/observability_integration.rs`
+
+- [ ] **Step 1: Write the test**
+
+```rust
+//! End-to-end test: an Actix app wrapped with MetricsMiddleware serves several
+//! routes; we then issue requests and parse the Prometheus scrape output to
+//! assert on the metrics that should appear.
+
+use actix_web::{App, HttpResponse, http::StatusCode, test, web};
+use prometheus::Encoder;
+use service_utils::observability::{
+    LabelConfig, MetricsMiddleware, Observability, ObservabilityConfig,
+};
+
+fn cfg() -> ObservabilityConfig {
+    ObservabilityConfig {
+        enabled: true,
+        bind: "127.0.0.1".parse().unwrap(),
+        port: 0,
+        label: LabelConfig::default(),
+        collect_interval: std::time::Duration::from_secs(10),
+        instance_id: "it".into(),
+        service_name: "sp-it".into(),
+        service_version: "0".into(),
+        deployment_environment: None,
+        otlp_endpoint: None,
+    }
+}
+
+fn scrape(obs: &Observability) -> String {
+    let metric_families = obs.registry().gather();
+    let mut buf = Vec::new();
+    prometheus::TextEncoder::new()
+        .encode(&metric_families, &mut buf)
+        .unwrap();
+    String::from_utf8(buf).unwrap()
+}
+
+#[actix_web::test]
+async fn metrics_appear_after_requests() {
+    let obs = Observability::init(cfg()).unwrap();
+    let mw = MetricsMiddleware::new(&obs.meter(), LabelConfig::default());
+    let app = test::init_service(
+        App::new()
+            .wrap(mw)
+            .route("/ping", web::get().to(|| async { HttpResponse::Ok() }))
+            .route(
+                "/echo/{name}",
+                web::post().to(|p: web::Path<String>| async move {
+                    HttpResponse::Created().body(p.into_inner())
+                }),
+            )
+            .route(
+                "/boom",
+                web::get().to(|| async { HttpResponse::InternalServerError() }),
+            ),
+    )
+    .await;
+
+    for _ in 0..3 {
+        let req = test::TestRequest::get().uri("/ping").to_request();
+        let resp = test::call_service(&app, req).await;
+        assert_eq!(resp.status(), StatusCode::OK);
+    }
+    let req = test::TestRequest::post().uri("/echo/world").to_request();
+    let resp = test::call_service(&app, req).await;
+    assert_eq!(resp.status(), StatusCode::CREATED);
+
+    let req = test::TestRequest::get().uri("/boom").to_request();
+    let resp = test::call_service(&app, req).await;
+    assert_eq!(resp.status(), StatusCode::INTERNAL_SERVER_ERROR);
+
+    let req = test::TestRequest::get().uri("/no-such-route").to_request();
+    let resp = test::call_service(&app, req).await;
+    assert_eq!(resp.status(), StatusCode::NOT_FOUND);
+
+    let body = scrape(&obs);
+
+    // Request duration histogram exists with expected labels for /ping (3 hits).
+    let ping_count_line = body
+        .lines()
+        .find(|l| {
+            l.starts_with("http_server_request_duration_seconds_count{")
+                && l.contains("http_route=\"/ping\"")
+                && l.contains("http_request_method=\"GET\"")
+                && l.contains("http_response_status_code=\"200\"")
+        })
+        .unwrap_or_else(|| panic!("no /ping count line in:\n{body}"));
+    let ping_count: f64 = ping_count_line
+        .rsplit_once(' ')
+        .unwrap()
+        .1
+        .trim()
+        .parse()
+        .unwrap();
+    assert_eq!(ping_count as u64, 3);
+
+    // 5xx series for /boom appears.
+    assert!(
+        body.lines().any(|l| {
+            l.starts_with("http_server_request_duration_seconds_count{")
+                && l.contains("http_route=\"/boom\"")
+                && l.contains("http_response_status_code=\"500\"")
+        }),
+        "no /boom 500 series in:\n{body}"
+    );
+
+    // Unmatched path uses the sentinel.
+    assert!(
+        body.lines().any(|l| {
+            l.starts_with("http_server_request_duration_seconds_count{")
+                && l.contains("http_route=\"__not_found__\"")
+        }),
+        "no __not_found__ series in:\n{body}"
+    );
+
+    // busy_duration_total > 0
+    let busy = body
+        .lines()
+        .find(|l| l.starts_with("http_server_busy_duration_seconds_total{"))
+        .unwrap_or_else(|| panic!("no busy_duration line in:\n{body}"));
+    let busy_value: f64 = busy.rsplit_once(' ').unwrap().1.trim().parse().unwrap();
+    assert!(busy_value > 0.0, "expected busy_duration > 0, got {busy_value}");
+
+    // active_requests returns to 0 after all requests complete.
+    let active_lines: Vec<_> = body
+        .lines()
+        .filter(|l| l.starts_with("http_server_active_requests{"))
+        .collect();
+    for line in &active_lines {
+        let v: f64 = line.rsplit_once(' ').unwrap().1.trim().parse().unwrap();
+        assert_eq!(v, 0.0, "active_requests not zero: {line}");
+    }
+}
+```
+
+- [ ] **Step 2: Run the test**
+
+Run: `cargo test -p service_utils --test observability_integration`
+Expected: 1 test passes.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add crates/service_utils/tests/observability_integration.rs
+git commit -m "test(observability): end-to-end integration
+
+Wraps a small App with MetricsMiddleware, issues requests of various
+shapes (200, 201, 500, 404), and asserts on the parsed Prometheus
+exposition: per-route counts, the 5xx series, the __not_found__
+sentinel, busy_duration > 0, and active_requests returning to 0.
+"
+```
+
+---
+
+## Task 20: Cardinality regression test
+
+**Files:**
+- Modify: `crates/service_utils/tests/observability_integration.rs`
+
+- [ ] **Step 1: Add the test**
+
+Append to the integration test file:
+
+```rust
+#[actix_web::test]
+async fn cardinality_stays_within_budget() {
+    let obs = Observability::init(cfg()).unwrap();
+    let mw = MetricsMiddleware::new(&obs.meter(), LabelConfig::default());
+    let app = test::init_service(
+        App::new()
+            .wrap(mw)
+            .route("/a", web::get().to(|| async { HttpResponse::Ok() }))
+            .route("/b", web::get().to(|| async { HttpResponse::Ok() }))
+            .route("/c", web::post().to(|| async { HttpResponse::Created() })),
+    )
+    .await;
+
+    for _ in 0..10 {
+        for path in &["/a", "/b"] {
+            let req = test::TestRequest::get().uri(path).to_request();
+            let _ = test::call_service(&app, req).await;
+        }
+        let req = test::TestRequest::post().uri("/c").to_request();
+        let _ = test::call_service(&app, req).await;
+    }
+
+    let body = scrape(&obs);
+    let series = body
+        .lines()
+        .filter(|l| !l.is_empty() && !l.starts_with('#'))
+        .count();
+
+    // Budget for this scenario: 3 routes × 1 method each × 1 status × ~12
+    // (10 buckets + sum + count) = ~36 series for the histogram, plus 3 for
+    // busy_duration, plus 1 for active_requests, plus a few from `target_info`
+    // that the prometheus exporter emits. Headroom: 200.
+    assert!(series <= 200, "cardinality regression: {series} series\n{body}");
+}
+```
+
+- [ ] **Step 2: Run the test**
+
+Run: `cargo test -p service_utils --test observability_integration`
+Expected: 2 tests pass.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add crates/service_utils/tests/observability_integration.rs
+git commit -m "test(observability): cardinality regression budget
+
+Asserts that a 3-route × 1-method × 1-status scenario produces no more
+than 200 series, catching accidental high-cardinality labels in review.
+"
+```
+
+---
+
+## Task 21: Update README + makefile note
+
+**Files:**
+- Modify: `README.md`
+- Modify: `makefile`
+
+- [ ] **Step 1: Add a section to README**
+
+Find the build/development section and append:
+
+````markdown
+### Metrics & observability
+
+The HTTP API exposes Prometheus metrics on `SUPERPOSITION_METRICS_PORT` (default `9091`):
+
+```
+curl http://localhost:9091/metrics
+```
+
+Health endpoints live on the main port: `GET /healthz`, `/livez`, `/readyz`.
+
+For full details (labels, cardinality, OTLP push), see
+[`docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md`](docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md).
+
+**Note on `tokio_unstable`.** The workspace's `.cargo/config.toml` enables
+`--cfg tokio_unstable` so `tokio-metrics` can collect runtime saturation. This
+flag only adds APIs; no behavioural change for existing code. Contributors who
+build outside `cargo` (e.g., custom IDE invocations) should pass the same flag,
+or accept that the `runtime.tokio.*` metrics will be absent.
+````
+
+- [ ] **Step 2: Sanity-check the makefile**
+
+Read the makefile's `build`/`run` targets. If they invoke `cargo` plainly, no change is needed (the `.cargo/config.toml` is picked up automatically). If they set `RUSTFLAGS=` explicitly anywhere, ensure `--cfg tokio_unstable` is preserved.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add README.md makefile
+git commit -m "docs: note metrics endpoints and tokio_unstable build flag"
+```
+
+---
+
+## Task 22: Final smoke-test pass
+
+**Files:** none (verification only)
+
+- [ ] **Step 1: Full test suite**
+
+Run: `cargo test --workspace`
+Expected: all tests pass (including the integration test from Task 19).
+
+- [ ] **Step 2: Build with metrics disabled**
+
+Run:
+
+```bash
+SUPERPOSITION_METRICS_ENABLED=false cargo build -p superposition
+```
+
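+Expected: builds cleanly. Note that `SUPERPOSITION_METRICS_ENABLED` is read at runtime by `ObservabilityConfig::from_env()`, so the build alone does not exercise the disabled path — also start the binary with the variable set to `false` and confirm it serves requests without starting the metrics server.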
+
+- [ ] **Step 3: Build without `tokio_unstable`**
+
+Run:
+
+```bash
+RUSTFLAGS="" cargo check -p service_utils
+```
+
+Expected: builds cleanly (the `not(tokio_unstable)` no-op stub is exercised).
+
+- [ ] **Step 4: Live smoke**
+
+Bring up the dev stack and verify metrics flow into Prometheus / VictoriaMetrics:
+
+```bash
+docker compose -f grafana/docker-compose.yaml up -d
+make run    # or whatever the makefile target is
+```
+
+Add a scrape target to `grafana/prometheus.yml` for `host.docker.internal:9091` (Mac) or the host IP (Linux), reload Prometheus (`docker compose restart prometheus`), and verify in the Prometheus UI's Targets page that the new target is `UP`. Query `http_server_request_duration_seconds_count` and confirm series with `http_route` labels appear after issuing a few requests.
+
+Stop the dev stack.
+
+- [ ] **Step 5: Commit (if any docs changed during smoke)**
+
+If the Prometheus scrape config got a new entry, commit it:
+
+```bash
+git add grafana/prometheus.yml
+git commit -m "chore(grafana): scrape superposition metrics endpoint"
+```
+
+Otherwise, no commit.
+
+---
+
+## Notes for self-review (already incorporated)
+
+- **Spec coverage.** Every section of the spec maps to a task: §5 architecture → Tasks 4, 18; §6 module structure → Tasks 4, 11, 12, 13, 14, 15, 16; §7 dependencies → Tasks 1, 3; §8 metric definitions → Tasks 7, 11, 14, 15, 16; §9 middleware mechanics → Tasks 6, 8, 9, 10, 11; §10 saturation collectors → Tasks 14, 15, 16; §11 configuration → Task 5; §12 testing strategy → Tasks 19, 20; §13 rollout — handled at deployment time, not in code (env var defaults); §14 future work — explicitly out of scope.
+- **Type consistency.** `HttpMeters` (Task 7), `ObservabilityConfig`/`LabelConfig` (Task 5), `Observability` (Tasks 4, 7), and the helper functions in `middleware.rs` (Tasks 6–11) all use consistent names across tasks.
+- **Build-flag duality.** Task 16 explicitly tests both with and without `tokio_unstable`. Task 22 retests this at the end as a regression check.
+- **Auth bypass mechanism.** Task 17 wires the health paths into the existing `tenant_middleware_exclusion_list` machinery (verified in spec §5.2 against `crates/service_utils/src/middlewares/auth_n.rs:44–60`), not the incorrect "register before auth_n" pattern that was in an earlier draft of the spec.
diff --git a/docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md b/docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md
new file mode 100644
index 000000000..a511c0841
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-10-otel-golden-signals-middleware-design.md
@@ -0,0 +1,526 @@
+# OpenTelemetry Golden-Signals Middleware
+
+- **Date:** 2026-05-10
+- **Status:** Design — shipped with deviations (see §0)
+- **Owner:** Natarajan Kannan
+- **Target crate:** `service_utils`
+- **Reference TSDB:** VictoriaMetrics (single-node `vmsingle`); design is TSDB-agnostic
+
+## 0. Post-implementation deviations
+
+The PR shipped with the following changes versus the design captured below. The body of this document is preserved as the original design rationale.
+
+- **Health endpoints (`/healthz`, `/livez`, `/readyz`) dropped.** The pre-existing `GET /health` already serves the up-check role; the k8s-conventional liveness/readiness split can be added in a follow-up PR when an actual deployment consumes it. This makes §5.2 ("Auth bypass for health endpoints"), the `health_endpoints()` API in §6, and Task 12/17 of the plan obsolete.
+- **`tokio_unstable` flag, `tokio-metrics` dep, and `.cargo/config.toml` removed.** Tokio 1.50 exposes `Handle::metrics().num_workers()`, `.global_queue_depth()`, and `.worker_total_busy_duration(i)` as stable APIs (the last gated on `target_has_atomic = "64"`, like tokio itself does). `saturation::tokio_runtime` reads `Handle::metrics()` directly inside each observable callback — no background sampler, no `RuntimeMonitor`, no atomics snapshot.
+- **`runtime.tokio.workers.busy_ratio` replaced with `runtime.tokio.workers.busy.time`.** Exposes cumulative busy time in seconds as a monotonic OTel Counter (summed across workers); Prometheus computes saturation via `rate(...) / num_workers` at query time. Same semantics, but Prometheus-idiomatic.
+- **`opentelemetry-semantic-conventions` dependency removed.** The handful of attribute names we use are inlined as string literals.
+- **`SaturationDeps::tokio_collect_interval` field removed.** No background sampler → no interval to configure. `SUPERPOSITION_METRICS_COLLECT_INTERVAL` still controls the OTLP periodic-reader cadence.
+- **`tenant_middleware_exclusion_list` reverted to env-only.** With health endpoints removed, there's no need to extend it programmatically.
+
+## 1. Background
+
+Superposition's Actix-web HTTP API has structured tracing via `tracing-actix-web` and a `RequestResponseLogger` middleware that emits a single `info!(latency = …, "GoldenSignal")` log line per request. There is no Prometheus/OpenTelemetry client, no `/metrics` endpoint, and no per-process gauge for in-flight work, DB pool state, or runtime saturation. The repository ships a `grafana/` directory with Prometheus + Grafana docker-compose and a Python `custom-exporter`, but no application metrics flow through it.
+
+This design adds first-class metrics exposition for the four [Google SRE golden signals](https://sre.google/sre-book/monitoring-distributed-systems/) — **latency, traffic, errors, saturation** — using OpenTelemetry, with VictoriaMetrics as the reference scrape target. Instrumentation is applied via Actix middleware so any existing or future API endpoint is covered automatically.
+
+## 2. Goals
+
+1. Expose Prometheus-format metrics for every HTTP route on the main API, covering latency / traffic / errors / saturation, without per-handler code changes.
+2. Emit OpenTelemetry semantic-convention metric names so any OTel-native backend (VictoriaMetrics, Prometheus, Grafana Mimir, SigNoz, OpenObserve, Datadog, Honeycomb, …) can ingest them.
+3. Support both **pull** (Prometheus scrape on a dedicated port) and **push** (OTLP HTTP/gRPC) exposition; users choose at deployment time via standard OTel env vars.
+4. Capture saturation signals beyond HTTP — DB connection pool, Redis connection pool, Tokio runtime — so a real "is this process overloaded?" view is possible.
+5. Keep the per-request overhead low (single-digit microseconds) and keep cardinality bounded by design.
+6. Provide a clean configuration surface so operators can disable high-cardinality labels (notably `workspace_id`) without code changes.
+
+## 3. Non-goals
+
+- **Trace correlation via exemplars.** Linking percentile spikes to specific traces requires `tracing-opentelemetry` to be wired through the existing tracing setup. Out of scope for this spec; a follow-up.
+- **OTLP traces export.** The same SDK init code is structured to host trace export later, but this spec covers metrics only.
+- **Per-tenant separate histograms.** Considered during design (option D; described in §14) and deferred until the global histogram's cardinality budget proves tight in production.
+- **Grafana dashboards.** A separate PR will commit JSON dashboards under `grafana/dashboards/` covering the four golden-signal panels.
+- **Alert rules.** A separate PR will commit VM/Prometheus alert rule YAML.
+- **Removing the existing `info!(… "GoldenSignal")` log.** Stays for now; downstream tooling may consume it. Marked for removal once dashboards have migrated.
+- **Instrumenting non-HTTP work** (background jobs, DB query timing per query). Out of scope for v1.
+
+## 4. Decisions summary
+
+| Decision | Choice | Rationale |
+|---|---|---|
+| Client library | OpenTelemetry SDK + Prometheus exporter | TSDB-agnostic; future-proof for OSS users; leaves room for unified traces + metrics later. |
+| TSDB (reference) | VictoriaMetrics (`vmsingle`) | Cheap to operate; Prom-compatible; cluster path exists if needed. |
+| Exposition transport | Prometheus scrape on dedicated port + optional OTLP push | Pull-by-default for self-hosted users; OTLP path unlocks every OTel-native backend. |
+| Labels on HTTP metrics | `route × method × status × org × workspace` | Tenant-level slicing in metrics; workspace label is env-disable-able for users with very high workspace counts. |
+| Saturation signals | HTTP active requests + HTTP busy duration + Tokio runtime + DB pool + Redis pool | Multiple independent signals avoid single-metric blind spots. Host-level (CPU/mem/FD) stays with `node-exporter`. |
+| Where `/metrics` lives | Separate listener on `SUPERPOSITION_METRICS_PORT` (default `9091`) | Network-policy isolation; scrape requests don't pollute the app's own metrics; no auth interaction. |
+| Where `/healthz` lives | Main app port `8080`, paths added to `auth_n`'s exception set | Probes exercise the real user-facing port. |
+| Module location | `crates/service_utils/src/observability.rs` (+ `observability/` for submodules, no `mod.rs`) | Matches existing convention for cross-cutting concerns; modern Rust 2018+ module layout. |
+| Build config | `.cargo/config.toml` adds `--cfg tokio_unstable` workspace-wide | Required by `tokio-metrics` runtime instrumentation. |
+
+## 5. Architecture
+
+A new module **`service_utils::observability`** owns three pieces:
+
+1. **`init()`** — called once from `main.rs` early in startup. Builds the OTel `MeterProvider` with two readers: a `PrometheusExporter` (renders to `/metrics`) and (if `OTEL_EXPORTER_OTLP_ENDPOINT` is set) a periodic OTLP push exporter. Returns an `Observability` handle owning the registry, a cloned `Meter`, and shutdown hooks.
+
+2. **`MetricsMiddleware`** — Actix `Transform`/`Service` pair wrapping every request on the main server. Records:
+   - `http.server.request.duration` (histogram, seconds)
+   - `http.server.busy.duration` (counter, seconds)
+   - `http.server.active_requests` (UpDownCounter)
+
+3. **`saturation::*`** — observable-gauge callbacks (no background tasks for r2d2 / fred) plus one `tokio::spawn` for `tokio-metrics` runtime polling. All emit OTel-namespaced metrics.
+
+A second component, **`metrics_server`**, is a separate `actix_web::HttpServer` on `SUPERPOSITION_METRICS_PORT` that exposes:
+
+- `GET /metrics` — Prometheus exposition rendered from the OTel registry
+
+The main server (port `8080`) gets one new `.wrap(MetricsMiddleware::new(meter.clone(), label_cfg))` line and three new route registrations for `/healthz`, `/livez`, `/readyz`.
+
+### 5.1 Data flow
+
+```text
+[request on :8080]
+   ├── tracing-actix-web ─→ span
+   ├── MetricsMiddleware ─→ start timer, inc active_requests (RAII guard)
+   │     ├── (auth_n / auth_z) ─→ extensions: org_id, workspace_id
+   │     └── handler runs
+   └── MetricsMiddleware ─→ record histogram, add busy_duration, dec active_requests, emit attributes
+
+[scrape on :9091/metrics] ←── PrometheusExporter ←── MeterProvider ←── (HTTP middleware + saturation collectors)
+                                                                  └─→ (optional) OTLP HTTP/gRPC push to OTEL_EXPORTER_OTLP_ENDPOINT
+
+[saturation, callback-driven]
+   ObservableGauge.with_callback(|obs| obs.observe(pool.state(), …))   // r2d2, fred
+   ObservableGauge reads from AtomicU64 written by a 10s tokio::spawn  // tokio-metrics
+```
+
+### 5.2 Middleware ordering (critical)
+
+`MetricsMiddleware` must run *outside* `auth_n` / `auth_z` / `OrgWorkspaceMiddlewareFactory` so that, when emitting metrics in the response phase, it can read `org_id` / `workspace_id` from request extensions. In Actix, the last `.wrap()` runs first on requests, so the registration chain in `main.rs` should look like (matching the existing convention noted at lines 204–219 of `main.rs`):
+
+```rust
+App::new()
+    .service(/* main api scopes */)
+    .service(health_endpoints())                  // /healthz /livez /readyz
+    // Auth innermost so outer middlewares still run on auth failures.
+    .wrap(auth_z.clone())
+    .wrap(auth_n.clone())
+    .wrap(/* DefaultHeaders, Compress as today */)
+    .wrap(RequestResponseLogger)
+    .wrap(MetricsMiddleware::new(meter.clone(), label_cfg))   // observability — outside auth, inside the tracing span
+    .wrap(TracingLogger::<CustomRootSpanBuilder>::new())      // outermost: span covers everything
+```
+
+**Auth bypass for health endpoints.** `auth_n` (`crates/service_utils/src/middlewares/auth_n.rs:44–60`) returns `Login::None` when the matched path is in its exception set. The existing `/health` route uses this mechanism. The new `/healthz`, `/livez`, `/readyz` paths are added to the same exception set construction site (the call site that builds the `HashSet<String>` passed into `auth_n`). With the exception in place, requests to health endpoints traverse all the middlewares above (so `MetricsMiddleware` does observe them — desirable) but `auth_n` short-circuits authentication and `auth_z` follows suit.
+
+## 6. Module structure
+
+All new code under `crates/service_utils`:
+
+```text
+crates/service_utils/src/
+  observability.rs              -- pub use surface: init(), Observability, shutdown(), errors
+  observability/
+    config.rs                   -- ObservabilityConfig parsed from env
+    meters.rs                   -- typed handles: HttpMeters, DbMeters, RedisMeters, RuntimeMeters
+    middleware.rs               -- MetricsMiddleware (Transform + Service + InFlightGuard)
+    metrics_server.rs           -- HttpServer on SUPERPOSITION_METRICS_PORT exposing /metrics
+    health.rs                   -- /healthz /livez /readyz handlers
+    saturation.rs               -- spawn entry: register_saturation_observers(...)
+    saturation/
+      db_pool.rs                -- r2d2 ObservableGauge callbacks
+      redis_pool.rs             -- fred ObservableGauge callbacks (cfg-gated on Redis configured)
+      tokio_runtime.rs          -- cfg(tokio_unstable); 10s poll task + AtomicU64 → ObservableGauge
+```
+
+Files use the modern Rust 2018+ module layout (no `mod.rs`). `crates/service_utils/src/middlewares/` is left untouched and continues to use whatever pattern it currently uses.
+
+### 6.1 Public API sketch
+
+```rust
+// observability.rs
+pub struct Observability { /* meter_provider, registry, otlp_pipeline, shutdown_handles */ }
+
+impl Observability {
+    pub fn init(cfg: ObservabilityConfig) -> Result<Self, ObservabilityError>;
+    pub fn meter(&self) -> opentelemetry::metrics::Meter;
+    pub fn registry(&self) -> std::sync::Arc<prometheus::Registry>;
+    pub fn shutdown(self) -> Result<(), ObservabilityError>;
+}
+
+pub fn metrics_middleware(meter: Meter, cfg: LabelConfig) -> middleware::MetricsMiddleware;
+
+pub fn spawn_metrics_server(
+    registry: std::sync::Arc<prometheus::Registry>,
+    bind: std::net::SocketAddr,
+) -> std::io::Result<actix_web::dev::Server>;
+
+pub fn health_endpoints() -> actix_web::Scope;
+pub fn health_endpoint_paths() -> &'static [&'static str];  // for auth_n exception set
+
+pub mod saturation {
+    pub fn register_observers(
+        meter: &Meter,
+        deps: SaturationDeps,
+    ) -> Result<(), ObservabilityError>;
+}
+
+pub struct SaturationDeps {
+    pub db_pool: Option<DbPoolHandle>,
+    pub redis_client: Option<FredClientHandle>,
+    pub tokio_collect_interval: std::time::Duration,
+}
+```
+
+## 7. Dependencies
+
+Added to root `Cargo.toml` `[workspace.dependencies]` and enabled in `crates/service_utils/Cargo.toml`. Versions pinned to whatever is current and compatible at implementation time; the table below is the intent.
+
+| Crate | Approx version | Purpose |
+|---|---|---|
+| `opentelemetry` | 0.27 | API surface: `Meter`, `Counter`, `Histogram`, `UpDownCounter`, `ObservableGauge` |
+| `opentelemetry_sdk` | 0.27 | SDK: `MeterProvider`, periodic readers, resource detection |
+| `opentelemetry-prometheus` | 0.27 | Bridge OTel → `prometheus::Registry` for scrape exposition |
+| `opentelemetry-otlp` | 0.27 | Optional OTLP HTTP/gRPC push exporter |
+| `opentelemetry-semantic-conventions` | 0.27 | String constants for attributes (`HTTP_ROUTE`, etc.) |
+| `prometheus` | 0.13 | Required by `opentelemetry-prometheus` for `Registry` and `TextEncoder` |
+| `tokio-metrics` | 0.3 | Runtime metrics; gated by `cfg(tokio_unstable)` |
+
+`fred` already has its `metrics` feature available; we will enable it in `service_utils/Cargo.toml` at implementation time.
+
+## 8. Metric definitions
+
+All names follow OpenTelemetry semantic conventions where they exist; saturation metrics use OTel namespaces (`db.client.*`, `runtime.*`). The Prometheus exporter translates dots to underscores and appends `_seconds` to histograms with unit `s`, etc.
+
+### 8.1 HTTP — golden signals
+
+#### Latency, traffic, errors
+
+One histogram covers all three. Traffic is `rate(_count)`; errors are `rate(_count{http_response_status_code=~"5.."})`. No separate counter is needed.
+
+| Field | Value |
+|---|---|
+| Name | `http.server.request.duration` |
+| Type | Histogram (f64, seconds) |
+| Unit | `s` |
+| Buckets (explicit) | `[0.005, 0.025, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]` (9 + `+Inf` = 10 buckets) |
+| Attributes | `http.request.method`, `http.response.status_code`, `http.route`, `sp.org_id`*, `sp.workspace_id`* |
+
+\* env-controlled, default on. Disable: `SUPERPOSITION_METRICS_LABEL_ORG=false`, `SUPERPOSITION_METRICS_LABEL_WORKSPACE=false`.
+
+**Bucket rationale.** Most p50/p95/p99 values for a config-fetch service land in 5 ms – 500 ms; the 1 s / 2.5 s / 5 s / 10 s buckets exist to detect tail badness, not to give resolution there. Trimming OTel's 15-boundary default down to 9 boundaries cuts the bucket series per label set by roughly 40 % — a direct cardinality win.
+
+**Derived expressions** (PromQL/MetricsQL):
+
+```promql
+# Traffic — requests/sec by route
+sum(rate(http_server_request_duration_seconds_count[1m])) by (http_route)
+
+# Error rate — 5xx fraction by route
+sum(rate(http_server_request_duration_seconds_count{http_response_status_code=~"5.."}[1m])) by (http_route)
+  /
+sum(rate(http_server_request_duration_seconds_count[1m])) by (http_route)
+
+# Latency — p99 by route
+histogram_quantile(0.99,
+  sum(rate(http_server_request_duration_seconds_bucket[1m])) by (le, http_route))
+```
+
+#### Saturation — HTTP
+
+Two metrics, each capturing a different aspect:
+
+| Field | Value |
+|---|---|
+| Name | `http.server.busy.duration` |
+| Type | Counter (f64, seconds) |
+| Unit | `s` |
+| Attributes | `http.request.method` |
+| Semantics | On each completed request, add elapsed seconds. `rate(...)` over a window gives **time-averaged request concurrency** (Little's Law). Insensitive to scrape aliasing. |
+
+| Field | Value |
+|---|---|
+| Name | `http.server.active_requests` |
+| Type | UpDownCounter |
+| Attributes | `http.request.method` |
+| Semantics | OTel semconv standard. Instantaneous value at scrape time. **Note:** for sub-100ms services this metric aliases badly; not the primary saturation signal. Kept for semconv compliance and dashboards that expect it. |
+
+`http.server.busy.duration` is the smooth, alert-safe saturation signal. `rate(http_server_busy_duration_seconds_total[1m])` is the average request concurrency over the last minute and can exceed worker count for I/O-bound work — that is expected, not a bug, because Tokio workers are not 1:1 with requests.
+
+### 8.2 DB pool saturation (`saturation::db_pool`)
+
+OTel `db.client.*` semantic conventions. Backed by `r2d2::Pool::state()` via observable callbacks — purely passive instrumentation, no changes at `pool.get()` call sites.
+
+| Name | Type | Attributes | Source |
+|---|---|---|---|
+| `db.client.connections.usage` | UpDownCounter (observable) | `state="idle"\|"used"`, `pool.name` | `state.idle_connections`; `state.connections - state.idle_connections` |
+| `db.client.connections.max` | Gauge (observable) | `pool.name` | `pool.max_size()` |
+
+`pool.name` is `"primary"` initially; the API supports multiple pools later.
+
+**Not in v1** (deferred to follow-up):
+
+- `db.client.connection.wait.duration` (histogram) — would require timing every `pool.get()` invocation across the codebase.
+- `db.client.connections.pending_requests` (gauge) — would require atomic-counter instrumentation at every `pool.get()` call site.
+
+Both become cheap once a typed pool wrapper exists (a single `App`-level helper that wraps `r2d2::Pool` and is the only way connections are obtained); that wrapper is a separate codebase change and is not in scope here. In v1, DB-pool saturation is signalled by `connections.usage` ratios — `connections.usage{state="used"} / connections.max` near 1.0 means saturation. The request-duration histogram tail will spike under DB starvation regardless.
+
+### 8.3 Redis pool saturation (`saturation::redis_pool`)
+
+Compiled out via `cfg` if Redis is not configured. Names mirror the DB pool. Backed by `fred`'s built-in metrics surface.
+
+| Name | Type | Attributes |
+|---|---|---|
+| `redis.client.connections.usage` | UpDownCounter (observable) | `state="idle"\|"used"`, `pool.name` |
+| `redis.client.commands.in_flight` | Gauge (observable) | `pool.name` |
+| `redis.client.command.latency` | Histogram (s) | `pool.name`, `command_kind="read"\|"write"\|"admin"` |
+
+Exact mapping from `fred` stats to these metrics is finalized at implementation time; if any field is unavailable, that metric is dropped from v1 with a TODO.
+
+### 8.4 Tokio runtime saturation (`saturation::tokio_runtime`)
+
+`#[cfg(tokio_unstable)]`-gated. Backed by `tokio_metrics::RuntimeMonitor`, polled every `SUPERPOSITION_METRICS_COLLECT_INTERVAL` (default 10 s) into `AtomicU64`s read by observable-gauge callbacks.
+
+| Name | Type | Attributes | Source |
+|---|---|---|---|
+| `runtime.tokio.workers` | Gauge (observable) | — | `num_workers` |
+| `runtime.tokio.workers.busy_ratio` | Gauge (observable, f64) | — | `total_busy_duration / total_polls / interval` |
+| `runtime.tokio.global_queue.depth` | Gauge (observable) | — | `global_queue_depth` |
+| `runtime.tokio.tasks.alive` | Gauge (observable) | — | `live_tasks_count` if available; otherwise dropped |
+
+If a contributor builds without `--cfg tokio_unstable`, the module compiles to a no-op stub; everything else still works.
+
+### 8.5 Resource attributes
+
+Set once at `MeterProvider` init, applied to every metric.
+
+| Attribute | Source |
+|---|---|
+| `service.name` | `OTEL_SERVICE_NAME` env; default `"superposition"` |
+| `service.version` | `env!("CARGO_PKG_VERSION")` at build time |
+| `service.instance.id` | `SUPERPOSITION_INSTANCE_ID` env; default to hostname |
+| `deployment.environment` | existing env detection (`PROD`/`SANDBOX`/`DEV`) |
+| `OTEL_RESOURCE_ATTRIBUTES` | merged in if set (standard OTel env var) |
+
+## 9. Middleware mechanics
+
+### 9.1 Route template extraction
+
+Actix exposes `req.match_pattern() -> Option<String>` returning the registered template (e.g., `/contexts/{context_id}`), not the raw URI. Three cases for `http.route`:
+
+| Match outcome | `http.route` value |
+|---|---|
+| Pattern matched | the pattern string |
+| No route matched (404 from no match) | `__not_found__` |
+| Static asset / Leptos frontend route | `__static__` |
+
+Sentinels are constants — finite set, bounded cardinality.
+
+`match_pattern()` is only populated after routing resolves. The middleware reads it in the response phase. The active-requests increment on entry uses `http.request.method` only, which is available immediately, so no ordering issue.
+
+### 9.2 Label extraction
+
+Read from request extensions during the response phase; the values are set during the request phase by `OrgWorkspaceMiddlewareFactory`, which runs inside `MetricsMiddleware` (§5.2):
+
+```rust
+let org_id = req.extensions().get::<OrgId>().map(|o| o.as_str().to_owned());
+let workspace = req.extensions().get::<WorkspaceName>().map(|w| w.as_str().to_owned());
+```
+
+For each:
+
+| Case | Action |
+|---|---|
+| Present | Emit attribute with the value. |
+| Absent because route does not have one (e.g., org-management routes) | Omit the attribute. Series simply lacks that label — distinct from a value of `""`. |
+| Absent because middleware short-circuited before setting it (401, 403) | Omit the attribute. |
+| `LabelConfig` has the label disabled | Never emit, regardless of presence. |
+
+### 9.3 HTTP method normalization
+
+Per OTel HTTP semconv: known methods (`GET`, `POST`, `PUT`, `DELETE`, `PATCH`, `HEAD`, `OPTIONS`, `TRACE`, `CONNECT`) keep their literal value; anything else collapses to `_OTHER`. Implemented as a small match — no library dependency. Prevents weird clients (`XPROPFIND`, `INVALID-㊙️`) from blowing up cardinality.
+
+### 9.4 Status code source
+
+| Outcome | Status used |
+|---|---|
+| Normal response | `res.status().as_u16()` |
+| Handler error converted by Actix | the converted response's status |
+| Panic (caught by Actix's panic handler → 500) | `500`, with `error.type="panic"` set on the histogram observation only |
+
+### 9.5 Active-requests guard (panic-safe)
+
+```rust
+struct InFlightGuard {
+    counter: UpDownCounter<i64>,
+    method_attr: KeyValue,
+    decremented: AtomicBool,
+}
+
+impl Drop for InFlightGuard {
+    fn drop(&mut self) {
+        if !self.decremented.swap(true, Ordering::Relaxed) {
+            self.counter.add(-1, &[self.method_attr.clone()]);
+        }
+    }
+}
+```
+
+On entry: increment, build guard, store in the request future. On normal completion: explicitly decrement (sets the flag). On client disconnect / future drop / panic upstream: `Drop` decrements as a fallback. The histogram is recorded only on normal completion — a half-finished request's latency is not meaningful.
+
+### 9.6 Endpoints excluded from instrumentation
+
+Hard-coded in v1 (configurable later):
+
+- `/metrics` — physically isolated on the metrics port; cannot reach the middleware.
+- Static asset routes — emit `__static__` for `http.route` instead of being skipped, so a flood is still visible.
+- `/healthz` `/livez` `/readyz` — instrumented (we want to observe them); auth bypass via `auth_n`'s existing path exception set. Their own latency contributes to `http.server.request.duration` under their own routes.
+
+### 9.7 Per-request overhead
+
+Expected:
+
+- ~3 hashmap lookups on `req.extensions()`
+- 2 system clock reads (`Instant::now()` on entry/exit)
+- 1 atomic increment + 1 atomic decrement on the active-requests gauge
+- 1 histogram `record()` call (lock-free in OTel SDK 0.27+)
+- 1 counter `add()` call for `http.server.busy.duration`
+
+**Hot-path allocations:** attribute *keys* are interned via `opentelemetry::Key::from_static_str`; attribute *values* (route, org, workspace) require `String` allocations because they are dynamic. This is unavoidable given the label choices in §4 and is intrinsic to OTel attribute construction.
+
+Total expected overhead: **single-digit microseconds per request**, well below the millisecond scale of any handler.
+
+## 10. Saturation collector internals
+
+### 10.1 Pull-on-observation pattern
+
+OTel's `ObservableGauge` and `ObservableCounter` invoke a callback at collection time (every scrape, every push interval). For sources that are cheap to read synchronously (`r2d2::Pool::state()`, `fred` stats), no background task is needed:
+
+```rust
+let pool_clone = pool.clone();
+meter
+    .u64_observable_gauge("db.client.connections.usage")
+    .with_callback(move |observer| {
+        let s = pool_clone.state();
+        observer.observe(s.idle_connections as u64,
+                         &[KeyValue::new("state", "idle"),
+                           KeyValue::new("pool.name", "primary")]);
+        observer.observe((s.connections - s.idle_connections) as u64,
+                         &[KeyValue::new("state", "used"),
+                           KeyValue::new("pool.name", "primary")]);
+    })
+    .init();
+```
+
+### 10.2 Tokio-metrics polling exception
+
+`tokio_metrics::RuntimeMonitor::intervals()` is a delta iterator — it returns stats since the last call, not absolute values. This requires one `tokio::spawn` polling at `SUPERPOSITION_METRICS_COLLECT_INTERVAL` (default 10 s). The task writes derived values into `AtomicU64`s; observable-gauge callbacks read those atomics. Single background task in the whole observability subsystem.
+
+### 10.3 Build configuration
+
+Workspace `.cargo/config.toml`:
+
+```toml
+[build]
+rustflags = ["--cfg", "tokio_unstable"]
+```
+
+Without the flag, the `saturation::tokio_runtime` module compiles to a no-op stub; everything else still works. `README.md` and `makefile` get a one-line callout. `tokio_unstable` only enables additional Tokio APIs — no behavioural change for existing code.
+
+CI runs `cargo check` both with and without the flag to keep the no-op stub honest.
+
+## 11. Configuration surface
+
+All env-driven; no config file. Applies to the main `superposition` binary.
+
+| Var | Default | Purpose |
+|---|---|---|
+| `SUPERPOSITION_METRICS_ENABLED` | `true` | Master switch. `false` ⇒ no init, no middleware, no listener. |
+| `SUPERPOSITION_METRICS_PORT` | `9091` | Port for the `/metrics` listener. |
+| `SUPERPOSITION_METRICS_BIND` | `0.0.0.0` | Bind address for the metrics listener. Set to `127.0.0.1` for loopback-only. |
+| `SUPERPOSITION_METRICS_LABEL_ORG` | `true` | Include `sp.org_id` attribute on HTTP metrics. |
+| `SUPERPOSITION_METRICS_LABEL_WORKSPACE` | `true` | Include `sp.workspace_id` attribute on HTTP metrics. |
+| `SUPERPOSITION_METRICS_COLLECT_INTERVAL` | `10s` | Tokio runtime metrics poll interval (only used if `tokio_unstable`). Parsed by `humantime`. |
+| `SUPERPOSITION_INSTANCE_ID` | hostname | `service.instance.id` resource attribute. |
+| `OTEL_EXPORTER_OTLP_ENDPOINT` | unset | Standard OTel env var. If set, enables OTLP push exporter. |
+| `OTEL_EXPORTER_OTLP_PROTOCOL` | `http/protobuf` | Standard OTel env var. |
+| `OTEL_EXPORTER_OTLP_HEADERS` | unset | Standard OTel env var. |
+| `OTEL_SERVICE_NAME` | `superposition` | Standard OTel env var. |
+| `OTEL_RESOURCE_ATTRIBUTES` | unset | Standard OTel env var; merged into resource. |
+
+Env reading uses the existing `service_utils` env-loading idiom, matching what `auth_n` etc. already do.
+
+### 11.1 Cardinality budget (worked)
+
+For the HTTP request-duration histogram, per active workspace × org pair in steady state:
+
+- ~30 routes × ~3 methods used × ~5 status codes seen × 12 series-per-bucket-set = **~5,400 series ceiling**, with realized usage typically 10–20 % → **~540–1,080 actual series per workspace**.
+
+Other metrics (active_requests, busy_duration, saturation gauges) are method-only or unlabeled → ~30 series total, independent of tenant count.
+
+So adding a workspace ≈ 600–1,100 new series. At 1,000 active workspaces ≈ **600 k – 1.1 M series** for the HTTP histogram. Comfortably within `vmsingle` on 16 GB.
+
+If workspace count grows beyond ~5,000 active, set `SUPERPOSITION_METRICS_LABEL_WORKSPACE=false` and slice by workspace via traces instead — no code change required.
+
+## 12. Testing strategy
+
+### 12.1 Unit tests (`crates/service_utils/src/observability/`)
+
+| Test module | What it asserts |
+|---|---|
+| `middleware::tests::label_extraction` | Table-driven: request fixtures with various extension states → expected `Vec<KeyValue>` produced. |
+| `middleware::tests::method_normalization` | `XPROPFIND` → `_OTHER`; known methods pass through. |
+| `middleware::tests::route_template_sentinels` | Unmatched path → `__not_found__`; static path → `__static__`. |
+| `middleware::tests::active_requests_panic_safety` | Handler that panics still decrements the gauge via `Drop`. |
+| `middleware::tests::label_config_disabled` | With `with_workspace_label=false`, attribute is not emitted even when present in extensions. |
+| `config::tests::env_parsing` | Env-var combinations produce expected `ObservabilityConfig`. |
+
+### 12.2 Integration test (`crates/service_utils/tests/observability_integration.rs`)
+
+1. Boot a test app: `MetricsMiddleware` + a small `/test` scope with several routes + the metrics server on a random port.
+2. Issue requests of varying methods, paths, status codes (including 404 to a non-route).
+3. Scrape the metrics port; parse the Prometheus exposition with the `prometheus-parse` crate (or equivalent).
+4. Assert:
+   - All expected metric names exist.
+   - `http_server_request_duration_seconds_count` per `(route, method, status)` matches the issued count.
+   - `__not_found__` route appears for the 404.
+   - `http_server_active_requests` returns to 0 after all requests complete.
+   - `http_server_busy_duration_seconds_total` is approximately `Σ request_duration` (within 10 %).
+5. Smoke-test the OTLP pipeline against a mock OTLP receiver if cheap; otherwise gate behind `#[ignore]` and document.
+
+### 12.3 Cardinality regression test
+
+After the §12.2 scenarios run, count the distinct series in the exposition. Fail the test if the total exceeds a budget (initial: 200 series for the test scenario). This catches accidental high-cardinality labels in CI, before they merge.
+
+## 13. Rollout
+
+| Phase | Duration | Action | Exit criterion |
+|---|---|---|---|
+| **1 — code lands disabled** | 1 PR | Land code with `SUPERPOSITION_METRICS_ENABLED=false` as the *deployed* default in prod environments (override on in CI/staging). | Process startup time unchanged; per-request overhead within noise on existing locust suite; `/metrics` exposition parses cleanly in CI. |
+| **2 — staging** | 48 h | `SUPERPOSITION_METRICS_ENABLED=true` in staging. | VM ingest rate stable; series count matches §11.1 estimate to within 30 %; no scrape errors. |
+| **3 — prod, no workspace label** | 1 week | Prod on, `SUPERPOSITION_METRICS_LABEL_WORKSPACE=false`. | VM headroom > 30 %; alerts (when defined in follow-up PR) firing as expected. |
+| **4 — prod, full** | — | `SUPERPOSITION_METRICS_LABEL_WORKSPACE=true`. | Steady-state. |
+
+Existing `info!(latency, "GoldenSignal")` log line at `crates/service_utils/src/middlewares/request_response_logging.rs:84` stays for now. Marked for removal once Phase 4 is steady and dashboards have migrated.
+
+## 14. Future work (not implemented)
+
+- **Trace correlation via exemplars.** When `tracing-opentelemetry` bridges traces into the same SDK, the histogram emits exemplars linking percentile spikes to specific traces. Free win once the bridge exists.
+- **Per-tenant separate histogram** (option D from the label-design brainstorm; see §3). If the global histogram's cardinality budget proves tight, add a second `http_server_request_duration_by_workspace_seconds` with fewer buckets, retaining tenant slicing without paying the cost on the global histogram.
+- **OTLP traces export.** The `Observability::init` shape is structured to host trace export later.
+- **Grafana dashboards.** JSON dashboards under `grafana/dashboards/` covering the four golden-signal panels. Separate PR.
+- **Alert rules.** VM/Prometheus alert rule YAML covering: error rate > X %, p99 latency > X ms, DB pool wait p99 > X ms, Tokio busy-ratio sustained > 0.8. Separate PR.
+- **Per-route overhead controls.** A route-level allowlist/denylist in `LabelConfig` so noisy or high-volume internal routes can be sampled or excluded at runtime without redeploying.
+- **DB pool wait visibility.** `db.client.connection.wait.duration` (histogram) and `db.client.connections.pending_requests` (gauge), unlocked by a typed pool wrapper that is the only way to obtain a connection. One-time codebase migration, then both metrics fall out for free.
+- **Removing the existing `GoldenSignal` log line.** Once dashboards are migrated, the log line in `request_response_logging.rs:84` becomes redundant.
+
+## 15. Risks
+
+| Risk | Mitigation |
+|---|---|
+| The OTel Rust SDK (currently 0.27) has historically had churn between minor versions; the metrics API has been stabilized, but exporter integrations may still shift. | Pin to a single minor version; central import via `service_utils::observability`; bump in a single PR with the integration test as the gate. |
+| `tokio_unstable` workspace flag affects all crates and may interact with future Tokio releases. | CI matrix runs `cargo check` with and without the flag. The `saturation::tokio_runtime` module is the only consumer; everything else compiles either way. |
+| Workspace label cardinality grows unexpectedly (workspace creation rate, churn from short-lived workspaces). | `SUPERPOSITION_METRICS_LABEL_WORKSPACE=false` is a runtime opt-out; rollout Phase 3 lands with it off. |
+| OTel attribute construction allocates `String` on the hot path. | Confirmed unavoidable for dynamic attribute values; the overhead is expected to benchmark in the single-digit microseconds. If profiling shows a problem, switch to `Cow<'static, str>` for attribute *values* where possible (e.g., method, status code) and keep allocations only for `route`/`org`/`workspace`. |
+| `r2d2`'s waiter count and wait duration require call-site instrumentation; v1 has only `connections.usage` ratios. | Acceptable for v1: usage approaching `connections.max` signals saturation, and the request-duration histogram tail will spike under DB starvation. A typed pool wrapper in a follow-up unlocks both `wait.duration` and `pending_requests` cheaply. |
+| `fred` metrics surface may not map 1:1 to OTel `db.client.*` style attributes. | Mapping is finalized at implementation time; any unavailable field is dropped from v1 with a TODO and noted in the PR description. |
+| Health-check probes on the main port get instrumented and add noise to `http_server_request_duration_seconds`. | Acceptable: probe cardinality is fixed (3 routes × 1 method × 1 status), and observing probe latency is desirable. |