diff --git a/Cargo.lock b/Cargo.lock index 7df7802d..a0a51631 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4142,23 +4142,8 @@ dependencies = [ "weaver-analysis", "weaver-core", "weaver-database", - "weaver-embedding", - "weaver-trace", -] - -[[package]] -name = "weaver-embedding" -version = "0.1.0" -dependencies = [ - "weaver-core", - "weaver-spu", -] - -[[package]] -name = "weaver-inference" -version = "0.1.0" -dependencies = [ "weaver-spu", + "weaver-trace", ] [[package]] @@ -4188,8 +4173,7 @@ dependencies = [ "weaver-core", "weaver-database", "weaver-demo", - "weaver-embedding", - "weaver-inference", + "weaver-spu", "weaver-trace", ] diff --git a/Cargo.toml b/Cargo.toml index c982a4b2..94d499e0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,12 +3,10 @@ resolver = "2" members = [ "crates/weaver-core", "crates/weaver-interface", - "crates/weaver-inference", "crates/weaver-database", "crates/weaver-demo", "crates/weaver-trace", "crates/weaver-analysis", - "crates/weaver-embedding", "crates/weaver-spu", ] @@ -23,8 +21,10 @@ license = "Apache-2.0" weaver-core = { path = "crates/weaver-core" } weaver-trace = { path = "crates/weaver-trace" } weaver-analysis = { path = "crates/weaver-analysis" } -weaver-embedding = { path = "crates/weaver-embedding" } -weaver-inference = { path = "crates/weaver-inference" } +# `weaver-embedding` and `weaver-inference` were deprecated re-export +# shells removed in PR-0.5.E. Their content folded into `weaver-spu` +# (encoder + decoder runtime) and `weaver-core` (Embedder trait). +# Workspace member count contracted from 9 → 7 per spec §6. # `default-features = false` at the workspace level so consumers can # explicitly opt out of weaver-spu's default feature set without # tripping cargo's "can't override workspace defaults" rule. 
diff --git a/crates/weaver-core/src/embedder.rs b/crates/weaver-core/src/embedder.rs index 5221c984..a7a768ef 100644 --- a/crates/weaver-core/src/embedder.rs +++ b/crates/weaver-core/src/embedder.rs @@ -7,7 +7,7 @@ //! lives: //! //! - **Pre-Phase-1 cutover**: gRPC client to the Python `weaver-embedder.service` -//! in `weaver-embedding::grpc_client::EmbeddingClient`. +//! in `weaver_spu::encoder::grpc_client_legacy::EmbeddingClient`. //! - **Post-Phase-1 cutover**: in-process candle backend //! `weaver-spu::encoder::client::EmbedderClient`. //! @@ -39,7 +39,7 @@ //! `Box` payload, so this crate doesn't need a //! tonic dep just to express gRPC failures. Backend impls map their //! concrete transport errors into the abstract `Transport` variant via -//! `From` impls (see `weaver-embedding::grpc_client`). +//! `From` impls (see `weaver_spu::encoder::grpc_client_legacy`). use async_trait::async_trait; @@ -107,7 +107,7 @@ pub struct LateChunkResult { /// /// Variants are deliberately abstract over transport so this crate /// doesn't pick up a tonic dep just to express gRPC errors. Backend -/// implementations (e.g., the gRPC client in `weaver-embedding`) map +/// implementations (e.g., the gRPC client in `weaver_spu::encoder::grpc_client_legacy`) map /// their concrete transport errors into [`Self::Transport`] via `From` /// impls and the message preserves the upstream error's `Display`. #[derive(Debug, thiserror::Error)] @@ -130,12 +130,12 @@ pub enum EmbeddingError { } // gRPC-transport `From` impls. These live here (not in -// `weaver-embedding`) because Rust's orphan rule requires foreign-on- +// `weaver_spu::encoder::grpc_client_legacy`) because Rust's orphan rule requires foreign-on- // foreign impls to be in the trait's crate. The cost is one tonic dep // in `weaver-core`; the alternative (explicit `.map_err` at every `?` // site in the gRPC client) is a larger maintenance burden than the // dep. 
Removed in PR-3.A along with the gRPC client itself, when the -// last consumer (weaver-embedding) goes away. +// last consumer (`weaver_spu::encoder::grpc_client_legacy`) goes away in PR-3.A. impl From for EmbeddingError { fn from(e: tonic::transport::Error) -> Self { EmbeddingError::Transport(Box::new(e)) @@ -151,7 +151,7 @@ impl From for EmbeddingError { /// Backend-agnostic embedding service. /// /// Implementations: -/// - `weaver_embedding::grpc_client::EmbeddingClient` — Python service over +/// - `weaver_spu::encoder::grpc_client_legacy::EmbeddingClient` — Python service over /// Unix socket / TCP gRPC (pre-Phase-1 cutover). /// - `weaver_spu::encoder::client::EmbedderClient` — Rust in-process /// candle backend (post-Phase-1 cutover). diff --git a/crates/weaver-database/src/chunking/mod.rs b/crates/weaver-database/src/chunking/mod.rs index 270cc782..a43e71e4 100644 --- a/crates/weaver-database/src/chunking/mod.rs +++ b/crates/weaver-database/src/chunking/mod.rs @@ -12,7 +12,7 @@ //! belongs alongside the rest of the embedding pipeline. Consumers //! that need `LateChunkConfig` / `LateChunkResult` / //! `late_chunk_embeddings` import from -//! `weaver_embedding::late_chunk::*` instead. +//! `weaver_spu::encoder::late_chunk::*` instead. mod strategies; diff --git a/crates/weaver-database/src/lib.rs b/crates/weaver-database/src/lib.rs index 9fc12c34..65debd5d 100644 --- a/crates/weaver-database/src/lib.rs +++ b/crates/weaver-database/src/lib.rs @@ -8,7 +8,7 @@ pub mod graph; // `weaver-embedding::grpc_client` per // `embedder-oxidization-Spec.md` §5.1 (issue #166 / sprint // Block A.5). External consumers import from -// `weaver_embedding::grpc_client` and `weaver_embedding::proto::embedding`. +// `weaver_spu::encoder::grpc_client_legacy` and `weaver_spu::proto::embedding`. 
// `weaver-database`'s `proto` module continues to generate // embedding-service types for backward compat with internal // proto-validation tests; a future cleanup PR removes the diff --git a/crates/weaver-demo/Cargo.toml b/crates/weaver-demo/Cargo.toml index 4783ef41..214e6da1 100644 --- a/crates/weaver-demo/Cargo.toml +++ b/crates/weaver-demo/Cargo.toml @@ -8,7 +8,10 @@ license.workspace = true [dependencies] weaver-core = { workspace = true } weaver-database = { workspace = true } -weaver-embedding = { workspace = true } +# `weaver-embedding` removed in PR-0.5.E. Demo uses the relocated +# encoder types via `weaver-spu` directly + the relocated `Embedder` +# trait via `weaver-core::embedder`. +weaver-spu = { workspace = true } weaver-trace = { workspace = true } weaver-analysis = { workspace = true } async-trait = { workspace = true } diff --git a/crates/weaver-demo/src/bin/embedder_latency.rs b/crates/weaver-demo/src/bin/embedder_latency.rs new file mode 100644 index 00000000..e1852f57 --- /dev/null +++ b/crates/weaver-demo/src/bin/embedder_latency.rs @@ -0,0 +1,381 @@ +// Pre-existing hidden-lifetime pattern at line 345 — not introduced +// by PR-0.5.E (the binary uses gRPC types whose generic lifetimes +// rustc began flagging more strictly). Out-of-scope cleanup for the +// no-op consolidation; allowed at file scope. +#![allow(elided_lifetimes_in_paths)] + +//! Embedder latency characterization for Option B (SuperEgo / ambient +//! surfacing) feasibility. +//! +//! The "latency is the enemy of agency" thesis says the harness's +//! internal-channel speed must beat the agent's external environment +//! by ≥1 OOM. The user's calibration: external = network = 1-10 ms; +//! the harness's transport floor target is the 100 ns range. We are +//! 4-5 OOM faster on transport-only IPC; what we measure here is +//! whether the embedder, which is part of the inner loop for +//! ambient surfacing, blows that budget. +//! +//! Two axes that matter: +//! +//! 
- **Input size sweep** — real `output.value` payloads from +//! benchmark traces are small (median 2 KB, max ~5 KB). We sweep +//! ~50 B → 32 KB to capture the realistic range plus the upper +//! tail. +//! - **Concurrency sweep** — Option B fires the embedder on every +//! tool output, and a twin-GPU bench has both agents firing into +//! the same embedder. Measuring 1 vs 2 vs 4 simultaneous requests +//! shows whether the current GPU2 (RTX 2000 75W) placement +//! collapses under contention. +//! +//! Output: a markdown table per (size × concurrency) cell with +//! P50 / P95 / P99 / max round-trip in nanoseconds, plus the +//! transport delta (round-trip minus server-reported compute) so +//! the IPC overhead is separable from the GPU cost. +//! +//! Per-call data is captured in Unix nanoseconds from +//! `SystemTime::UNIX_EPOCH` at request emit and response receive, +//! so future runs can be cross-correlated with packet captures or +//! CUDA-side instrumentation if/when we add per-stage profiling on +//! the Python embedder. + +use std::env; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + +use tokio::task::JoinSet; + +use weaver_spu::encoder::grpc_client_legacy::{EmbeddingClient, EmbeddingClientConfig, EmbeddingEndpoint}; + +/// Single measured call. +struct Sample { + /// Wall-clock at send (Unix nanoseconds). Captured for future + /// cross-process correlation; not consumed by the aggregate + /// stats emitted in this run. + #[allow(dead_code)] + t_send_unix_ns: u128, + /// Wall-clock at receive (Unix nanoseconds). Same rationale as + /// `t_send_unix_ns`. + #[allow(dead_code)] + t_recv_unix_ns: u128, + /// Round-trip from `Instant::now()` deltas — monotonic, immune to + /// wall-clock jumps. This is the load-bearing measurement. + roundtrip_ns: u128, + /// Server-reported compute time. Service still uses ms; converted + /// to ns at the boundary so all subsequent math is in one unit. 
+ server_compute_ns: u128, + /// Round-trip minus server compute = IPC + serialization + + /// scheduler overhead. Negative values are clamped to 0 (the + /// server's `duration_ms` is integer-rounded so 0 ms server with + /// any positive client roundtrip still yields a sensible delta). + transport_overhead_ns: u128, +} + +/// Aggregated stats for a (size × concurrency) cell. +struct CellStats { + n: usize, + rt_p50_ns: u128, + rt_p95_ns: u128, + rt_p99_ns: u128, + rt_max_ns: u128, + rt_mean_ns: u128, + server_p50_ns: u128, + server_p95_ns: u128, + transport_p50_ns: u128, + transport_p95_ns: u128, +} + +impl CellStats { + fn from_samples(samples: &[Sample]) -> Self { + let mut rt: Vec = samples.iter().map(|s| s.roundtrip_ns).collect(); + let mut server: Vec = samples.iter().map(|s| s.server_compute_ns).collect(); + let mut transport: Vec = samples.iter().map(|s| s.transport_overhead_ns).collect(); + rt.sort_unstable(); + server.sort_unstable(); + transport.sort_unstable(); + + let pct = |sorted: &[u128], p: f64| -> u128 { + if sorted.is_empty() { + return 0; + } + // Nearest-rank percentile — adequate at n=100. We avoid + // interpolation so the reported number is an actual + // observed value, not a synthetic midpoint. + let idx = ((p * (sorted.len() as f64)).ceil() as usize).saturating_sub(1); + sorted[idx.min(sorted.len() - 1)] + }; + + let mean = if rt.is_empty() { + 0 + } else { + rt.iter().sum::() / (rt.len() as u128) + }; + + Self { + n: samples.len(), + rt_p50_ns: pct(&rt, 0.50), + rt_p95_ns: pct(&rt, 0.95), + rt_p99_ns: pct(&rt, 0.99), + rt_max_ns: *rt.last().unwrap_or(&0), + rt_mean_ns: mean, + server_p50_ns: pct(&server, 0.50), + server_p95_ns: pct(&server, 0.95), + transport_p50_ns: pct(&transport, 0.50), + transport_p95_ns: pct(&transport, 0.95), + } + } +} + +/// Format a nanosecond count with the most readable unit. 
+fn fmt_ns(ns: u128) -> String { + if ns < 1_000 { + format!("{ns} ns") + } else if ns < 1_000_000 { + format!("{:.2} µs", (ns as f64) / 1_000.0) + } else if ns < 1_000_000_000 { + format!("{:.2} ms", (ns as f64) / 1_000_000.0) + } else { + format!("{:.3} s", (ns as f64) / 1_000_000_000.0) + } +} + +async fn one_call(client: &EmbeddingClient, text: &str, task: &str) -> anyhow::Result { + let t_send_unix_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos(); + let t0 = Instant::now(); + + // `embed_one` calls `embed` with a single-element batch; the + // result carries the server-reported `duration_ms`. + let result = client.embed(&[text.to_string()], task, None).await?; + + let elapsed = t0.elapsed(); + let t_recv_unix_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos(); + + let roundtrip_ns = elapsed.as_nanos(); + let server_compute_ns = (result.duration_ms as u128) * 1_000_000; + let transport_overhead_ns = roundtrip_ns.saturating_sub(server_compute_ns); + + Ok(Sample { + t_send_unix_ns, + t_recv_unix_ns, + roundtrip_ns, + server_compute_ns, + transport_overhead_ns, + }) +} + +/// Run a (size × concurrency) cell. +async fn run_cell( + client: Arc, + text: Arc, + task: &str, + iters: usize, + concurrency: usize, +) -> anyhow::Result> { + let mut all_samples: Vec = Vec::with_capacity(iters); + let task_str = task.to_string(); + + if concurrency == 1 { + for _ in 0..iters { + let s = one_call(&client, &text, &task_str).await?; + all_samples.push(s); + } + } else { + // Split iters across `concurrency` parallel tasks. Each task + // does its own sequential calls — the `concurrency` parameter + // models "how many simultaneous in-flight requests is the + // embedder receiving," which is what GPU2 contention sees + // when both agents fire ambient surfacing at once. 
+ let per_worker = iters.div_ceil(concurrency); + let mut set = JoinSet::new(); + for _ in 0..concurrency { + let c = client.clone(); + let t = text.clone(); + let task_owned = task_str.clone(); + set.spawn(async move { + let mut local: Vec = Vec::with_capacity(per_worker); + for _ in 0..per_worker { + let s = one_call(&c, &t, &task_owned).await?; + local.push(s); + } + Ok::, anyhow::Error>(local) + }); + } + while let Some(res) = set.join_next().await { + let local = res??; + all_samples.extend(local); + } + // Trim any overshoot from the rounding in `per_worker`. + all_samples.truncate(iters); + } + + Ok(all_samples) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + // Allow override of the default socket via env, useful for + // pointing at a moved embedder later (e.g. GPU0 co-resident + // experiment) without recompiling. + let socket = env::var("WEAVER_EMBEDDER_SOCKET") + .map(PathBuf::from) + .unwrap_or_else(|_| PathBuf::from("/run/weaver/embedder.sock")); + + println!("# Embedder latency characterization"); + println!(); + println!("**Date:** {}", chrono::Utc::now().to_rfc3339()); + println!("**Socket:** `{}`", socket.display()); + println!("**Task:** `retrieval.query`"); + println!(); + println!("Per-call timing uses `Instant::now()` for monotonic round-trip and"); + println!("`SystemTime` (Unix nanoseconds) for cross-process correlation."); + println!("Server-reported compute time is in milliseconds; transport overhead"); + println!("= round-trip − server compute."); + println!(); + + let cfg = EmbeddingClientConfig { + endpoint: EmbeddingEndpoint::Unix(socket), + timeout: Duration::from_secs(30), + connect_timeout: Duration::from_secs(5), + }; + + println!("## Connecting…"); + let client = EmbeddingClient::connect(cfg).await?; + let client = Arc::new(client); + println!("Connected."); + println!(); + + // Input size sweep. Sizes chosen from real trace data + // (median 2 KB, max ~5 KB) plus an upper-tail synthetic. 
+ let sizes: &[(usize, &str)] = &[ + (50, "small_50B"), + (500, "small_500B"), + (2_000, "median_2KB"), + (5_000, "max_observed_5KB"), + (16_000, "upper_tail_16KB"), + ]; + + let task = "retrieval.query"; + let warmup_iters = 5; + let measured_iters = 100; + let concurrencies = [1usize, 2, 4]; + + println!("## Warmup"); + println!(); + println!("{warmup_iters} calls at the median size to warm the GPU and bring weights into HBM."); + println!(); + let warm_text = Arc::new("a".repeat(2_000)); + for i in 0..warmup_iters { + let s = one_call(&client, &warm_text, task).await?; + println!( + "warmup {} → roundtrip {}, server {} ms", + i + 1, + fmt_ns(s.roundtrip_ns), + s.server_compute_ns / 1_000_000, + ); + } + println!(); + + // Run each (size × concurrency) cell. + println!("## Results"); + println!(); + println!( + "| Size | Concurrency | n | RT P50 | RT P95 | RT P99 | RT max | RT mean | Server P50 | Server P95 | Transport P50 | Transport P95 |" + ); + println!("|---|---|---|---|---|---|---|---|---|---|---|---|"); + + let mut all_cells: Vec<(String, usize, CellStats)> = Vec::new(); + + for (size_bytes, size_label) in sizes { + // Synthetic payload of the target size. We don't care about + // semantics — Jina V4's compute cost depends on tokenized + // length, and 'a' chars tokenize predictably without + // exercising any vocab edge cases. 
+ let text = Arc::new("a".repeat(*size_bytes)); + + for concurrency in concurrencies { + print!( + "{} (B={}) × c={} → running…", + size_label, size_bytes, concurrency + ); + let samples = run_cell( + client.clone(), + text.clone(), + task, + measured_iters, + concurrency, + ) + .await?; + let stats = CellStats::from_samples(&samples); + println!(" done."); + + println!( + "| {} (`{} B`) | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |", + size_label, + size_bytes, + concurrency, + stats.n, + fmt_ns(stats.rt_p50_ns), + fmt_ns(stats.rt_p95_ns), + fmt_ns(stats.rt_p99_ns), + fmt_ns(stats.rt_max_ns), + fmt_ns(stats.rt_mean_ns), + fmt_ns(stats.server_p50_ns), + fmt_ns(stats.server_p95_ns), + fmt_ns(stats.transport_p50_ns), + fmt_ns(stats.transport_p95_ns), + ); + + all_cells.push((format!("{size_label} ({size_bytes} B)"), concurrency, stats)); + } + } + + println!(); + println!("## OOM-test reading"); + println!(); + println!("**Reference points** (per `latency is the enemy of agency`):"); + println!("- 100 ns target = harness IPC floor (decisively faster than network)."); + println!("- 1-10 ms = industry-standard \"good\" network latency (the bar to beat by ≥1 OOM)."); + println!("- ≥1 ms = same-OOM-as-network territory; harness loses its claimed advantage."); + println!(); + let single_cells: Vec<&CellStats> = all_cells + .iter() + .filter(|(_, c, _)| *c == 1) + .map(|(_, _, s)| s) + .collect(); + if let Some(median_cell) = single_cells.get(2) { + let rt_p95 = median_cell.rt_p95_ns; + let in_ms = (rt_p95 as f64) / 1_000_000.0; + let oom_vs_network = if rt_p95 > 0 { + (10_000_000_f64 / rt_p95 as f64).log10() + } else { + f64::INFINITY + }; + println!( + "Median-size (~2 KB), c=1, P95 round-trip: **{}** ({:.2} ms).", + fmt_ns(rt_p95), + in_ms + ); + println!("vs 10 ms network: {:.2} OOM faster.", oom_vs_network); + println!( + "vs 100 ns transport floor: {:.2} OOM slower.", + ((rt_p95 as f64) / 100.0).log10() + ); + println!(); + println!( + "**Reading:** the 
embedder is in the millisecond range, comparable to network latency. \ + It is part of the harness but its compute cost puts it in the same OOM as the external \ + environment it is supposed to beat. Ambient surfacing in Option B would have to either \ + (a) fire rarely enough that millisecond cost amortizes, or (b) move the embedder into a \ + topology where the compute cost is hidden behind cheaper transport (e.g. co-resident on \ + the agent's GPU with intra-VRAM transfer)." + ); + } + + Ok(()) +} diff --git a/crates/weaver-demo/src/herobench/belief.rs b/crates/weaver-demo/src/herobench/belief.rs index d4b227e5..8578e9fb 100644 --- a/crates/weaver-demo/src/herobench/belief.rs +++ b/crates/weaver-demo/src/herobench/belief.rs @@ -1629,7 +1629,7 @@ pub enum HypothesisDedupError { /// consume attempt numbers. pub async fn dedup_and_upsert_attempt_hypothesis( pool: &ArangoPool, - embedder: &weaver_embedding::grpc_client::EmbeddingClient, + embedder: &weaver_spu::encoder::grpc_client_legacy::EmbeddingClient, task_name: &str, attempt_num: usize, hypothesis: &str, diff --git a/crates/weaver-demo/src/herobench/benchmark.rs b/crates/weaver-demo/src/herobench/benchmark.rs index 58c45fa7..185e1609 100644 --- a/crates/weaver-demo/src/herobench/benchmark.rs +++ b/crates/weaver-demo/src/herobench/benchmark.rs @@ -914,9 +914,9 @@ pub async fn run_benchmark(config: BenchmarkConfig) -> Result { // stays on the degraded path). This preserves benchmark viability on // machines without the embedder while still wiring the gate for any run // that has one up. 
- let embedder: Option> = + let embedder: Option> = if db_pool.is_some() { - match weaver_embedding::grpc_client::EmbeddingClient::connect_default().await + match weaver_spu::encoder::grpc_client_legacy::EmbeddingClient::connect_default().await { Ok(client) => match client.ensure_ready().await { Ok(_) => Some(Arc::new(client)), diff --git a/crates/weaver-demo/tests/herobench_integration.rs b/crates/weaver-demo/tests/herobench_integration.rs index b0eddf61..bb929bd9 100644 --- a/crates/weaver-demo/tests/herobench_integration.rs +++ b/crates/weaver-demo/tests/herobench_integration.rs @@ -2294,8 +2294,8 @@ async fn integration_graph_stats() { // --ignored integration_dedup // --------------------------------------------------------------------------- -async fn try_connect_embedder() -> Option { - let client = match weaver_embedding::grpc_client::EmbeddingClient::connect_default().await { +async fn try_connect_embedder() -> Option { + let client = match weaver_spu::encoder::grpc_client_legacy::EmbeddingClient::connect_default().await { Ok(c) => c, Err(e) => { eprintln!("skipping: embedder not reachable ({e})"); diff --git a/crates/weaver-demo/tests/similarity_calibration.rs b/crates/weaver-demo/tests/similarity_calibration.rs index 5f01838b..1ccb5632 100644 --- a/crates/weaver-demo/tests/similarity_calibration.rs +++ b/crates/weaver-demo/tests/similarity_calibration.rs @@ -24,7 +24,7 @@ use weaver_demo::herobench::similarity::{ Classification, DUPLICATE_PAIRS, DUPLICATE_THRESHOLD, MemoryCandidate, NEAR_DUPLICATE_THRESHOLD, NOVEL_PAIRS, classify, }; -use weaver_embedding::grpc_client::EmbeddingClient; +use weaver_spu::encoder::grpc_client_legacy::EmbeddingClient; /// Embed one hypothesis string into a candidate with no key — callers /// wrap the result with a key appropriate to the assertion being made. 
diff --git a/crates/weaver-embedding/Cargo.toml b/crates/weaver-embedding/Cargo.toml deleted file mode 100644 index 094b1dbf..00000000 --- a/crates/weaver-embedding/Cargo.toml +++ /dev/null @@ -1,24 +0,0 @@ -[package] -name = "weaver-embedding" -version.workspace = true -edition.workspace = true -rust-version.workspace = true -license.workspace = true -description = "Deprecated re-export shell — content folded into weaver-spu (encoder side) + weaver-core (Embedder trait) per PR-0.5.B + PR-0.5.D; removed in PR-0.5.E." - -[features] -# Forward feature flags to weaver-spu so consumers that opt into -# `gguf` (legacy GGUF encoder backend + Persephone gRPC client) via -# this crate still get the corresponding weaver-spu feature enabled. -# Removed in PR-0.5.E. -default = [] -gguf = ["weaver-spu/gguf"] - -[dependencies] -# Sole runtime deps: weaver-core (for the relocated `Embedder` trait -# + types) and weaver-spu (for the relocated late_chunk / pin / -# legacy gRPC + GGUF backends + proto module). -# `default-features = false` on weaver-spu to avoid feature leakage — -# this shell crate forwards features explicitly via the `gguf` flag. -weaver-core = { workspace = true } -weaver-spu = { workspace = true, default-features = false } diff --git a/crates/weaver-embedding/src/lib.rs b/crates/weaver-embedding/src/lib.rs deleted file mode 100644 index 9b80de17..00000000 --- a/crates/weaver-embedding/src/lib.rs +++ /dev/null @@ -1,104 +0,0 @@ -//! Deprecated re-export shell — `weaver-embedding`'s encoder-side -//! modules folded into `weaver-spu` per `docs/specs/weaver-spu-Spec.md` -//! PR-0.5.D. -//! -//! New code should import: -//! - `Embedder` trait + types: `weaver_core::embedder::*` -//! (relocated in PR-0.5.B). -//! - `late_chunk` + `LateChunkConfig` / `LateChunkResult` / -//! `late_chunk_embeddings`: `weaver_spu::encoder::*`. -//! - gRPC client (`EmbeddingClient`, `EmbeddingClientConfig`, -//! `EmbeddingEndpoint`): `weaver_spu::encoder::grpc_client_legacy` -//! 
(always available — production embedder backend during the -//! migration window; retires in PR-3.A). -//! - GGUF embedder backend: `weaver_spu::encoder::gguf_backend` -//! (gated by `gguf` feature; retires in PR-3.B). -//! - Cohort-pin guard: `weaver_spu::core::pin` (relocated in -//! PR-0.5.D). -//! - Persephone proto: `weaver_spu::proto::embedding::*` (always -//! available; retires in PR-3.A). -//! -//! This crate stays in the workspace through the migration window so -//! existing consumers compile unchanged. Removed in PR-0.5.E along -//! with `weaver-inference` and the rest of the legacy crates. - -// Module-scoped allow — this entire crate is a deprecated shell. -// Every re-export below points at a stable weaver-spu or weaver-core -// module. External consumers see deprecation warnings on their own -// `use` lines. -#![allow(deprecated)] -#![doc(html_root_url = "https://docs.rs/weaver-embedding/0.1.0")] - -// `Embedder` trait + types (relocated in PR-0.5.B). -#[deprecated( - since = "0.1.0", - note = "moved to weaver_core::embedder::Embedder; this re-export goes away in PR-0.5.E" -)] -pub use weaver_core::embedder::Embedder; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_core::embedder::EmbedderInfo; this alias goes away in PR-0.5.E" -)] -pub type EmbedderInfo = weaver_core::embedder::EmbedderInfo; - -// Late-chunking surface (relocated in PR-0.5.D). 
-#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::encoder::late_chunk::LateChunkConfig; this alias goes away in PR-0.5.E" -)] -pub type LateChunkConfig = weaver_spu::encoder::LateChunkConfig; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::encoder::late_chunk::LateChunkResult; this alias goes away in PR-0.5.E" -)] -pub type LateChunkResult = weaver_spu::encoder::LateChunkResult; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::encoder::late_chunk::late_chunk_embeddings; consumers should call the function via its new path" -)] -pub use weaver_spu::encoder::late_chunk_embeddings; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::encoder::late_chunk; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::encoder::late_chunk; - -// Cohort-pin guard (relocated in PR-0.5.D). -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::core::pin; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::core::pin; - -// gRPC client + proto (always-available; the gRPC client is the -// production embedder backend during the migration window). -// Retires in PR-3.A. -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::encoder::grpc_client_legacy; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::encoder::grpc_client_legacy as grpc_client; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::proto; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::proto; - -// GGUF backend (feature-gated; legacy in-process llama.cpp embedder -// path; retires in PR-3.B). 
-#[cfg(feature = "gguf")] -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::encoder::gguf_backend; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::encoder::gguf_backend; - -// Convenience top-level re-exports preserving the prior shape: -// `use weaver_embedding::{LateChunkConfig, LateChunkResult, -// late_chunk_embeddings};` continues to compile, but each name is -// individually deprecated above so consumers see warnings. diff --git a/crates/weaver-inference/Cargo.toml b/crates/weaver-inference/Cargo.toml deleted file mode 100644 index 320cb025..00000000 --- a/crates/weaver-inference/Cargo.toml +++ /dev/null @@ -1,32 +0,0 @@ -[package] -name = "weaver-inference" -description = "Deprecated re-export shell — content folded into weaver-spu per PR-0.5.C; removed in PR-0.5.E" -edition.workspace = true -rust-version.workspace = true -version.workspace = true -license.workspace = true - -[features] -# Forward feature flags to weaver-spu so consumers that opt into -# `cuda` or `gguf` via this crate still get the corresponding -# weaver-spu feature enabled. Removed in PR-0.5.E. -default = [] -cuda = ["weaver-spu/cuda"] -gguf = ["weaver-spu/gguf"] - -[dependencies] -# Sole dependency: the new home of everything this crate used to host. -# `default-features = false` prevents accidental feature leakage — -# `weaver-spu`'s default feature set is empty today but could grow in -# future PRs (e.g., enabling candle by default), and we don't want a -# transient consumer of this deprecated shell to silently activate -# heavier downstream features. This crate's `cuda` and `gguf` features -# explicitly forward to the corresponding `weaver-spu` features -# above, which is the only intended path. -weaver-spu = { workspace = true, default-features = false } - -# Note: the binary `weaver-infer` moved to weaver-spu in PR-0.5.C -# (now `weaver-spu/src/main.rs`). The `[[bin]]` section is gone here; -# operators run `cargo run -p weaver-spu --bin weaver-infer`. 
The -# legacy `cargo run -p weaver-inference --bin weaver-infer` no longer -# resolves; spec PR-0.5.C calls this out. diff --git a/crates/weaver-inference/examples/inference.toml b/crates/weaver-inference/examples/inference.toml deleted file mode 100644 index 6c5c0120..00000000 --- a/crates/weaver-inference/examples/inference.toml +++ /dev/null @@ -1,35 +0,0 @@ -# weaver-inference configuration -# -# Usage: -# weaver-infer --config examples/inference.toml -# -# All fields except model_id have defaults. -# CLI args override values from this file. - -# HuggingFace model ID -model_id = "Qwen/Qwen2.5-3B" - -# Tensor parallelism degree (1 = single GPU, 2 = dual GPU) -tp_size = 1 - -# Explicit GPU ordinals (0-indexed CUDA device IDs). -# Omit to auto-select. Length must match tp_size. -# -# Todd's workstation: -# 0 = NVIDIA RTX A6000 (48GB, sm_86) -# 1 = NVIDIA RTX A6000 (48GB, sm_86) -# 2 = NVIDIA RTX 2000 Ada (16GB, sm_89) -# -# Examples: -# gpu_ordinals = [2] # Small model on the Ada -# gpu_ordinals = [0, 1] # Large model across both A6000s (tp_size = 2) -gpu_ordinals = [2] - -# Maximum sequence length (prompt + generation) -max_seq_len = 4096 - -# Unix socket path (set to "" to disable) -socket_path = "/run/weaver/inference.sock" - -# TCP listener address (uncomment for remote access) -# listen_addr = "0.0.0.0:8080" diff --git a/crates/weaver-inference/kernels/transformer.cu b/crates/weaver-inference/kernels/transformer.cu deleted file mode 100644 index 9eba1359..00000000 --- a/crates/weaver-inference/kernels/transformer.cu +++ /dev/null @@ -1,1266 +0,0 @@ -// Transformer inference kernels for Qwen/Llama/Mistral/Gemma4 architectures. -// -// All kernels operate on f16 data (__half). Element-wise operations -// accumulate in f32 internally for numerical stability. -// -// Each __global__ kernel has a corresponding extern "C" host-side -// launcher function that the Rust FFI calls. The launcher computes -// grid/block dimensions and invokes the kernel. 
-// -// Compiled to SASS for sm_86 (A6000) and sm_89 (RTX Ada) via build.rs. - -#include -#include -#include - -// Block size for 1D kernels -#define BLOCK_SIZE 256 - -// ============================================================================ -// RMSNorm: x_norm = x * rsqrt(mean(x^2) + eps) * weight -// ============================================================================ - -static __global__ void rmsnorm_kernel( - const half* __restrict__ input, - const half* __restrict__ weight, - half* __restrict__ output, - int hidden_size, - float eps -) { - int row = blockIdx.x; - int tid = threadIdx.x; - int stride = blockDim.x; - - const half* x = input + (long long)row * hidden_size; - half* out = output + (long long)row * hidden_size; - - extern __shared__ float sdata[]; - - float local_sum = 0.0f; - for (int i = tid; i < hidden_size; i += stride) { - float val = __half2float(x[i]); - local_sum += val * val; - } - sdata[tid] = local_sum; - __syncthreads(); - - for (int s = stride / 2; s > 0; s >>= 1) { - if (tid < s) { - sdata[tid] += sdata[tid + s]; - } - __syncthreads(); - } - - float rms = rsqrtf(sdata[0] / (float)hidden_size + eps); - - for (int i = tid; i < hidden_size; i += stride) { - float val = __half2float(x[i]) * rms; - out[i] = __float2half(val * __half2float(weight[i])); - } -} - -extern "C" void launch_rmsnorm( - const void* input, - const void* weight, - void* output, - int batch_size, - int hidden_size, - float eps, - void* stream -) { - int threads = (hidden_size < BLOCK_SIZE) ? hidden_size : BLOCK_SIZE; - int smem = threads * sizeof(float); - rmsnorm_kernel<<>>( - (const half*)input, (const half*)weight, (half*)output, - hidden_size, eps - ); -} - -// ============================================================================ -// RoPE: Rotary Position Embedding (rotary-half convention) -// -// Qwen/Llama/Mistral use rotary-half: pairs are (d[i], d[i + half_dim]) -// NOT interleaved pairs (d[2i], d[2i+1]). 
-// -// For each pair: -// new_d[i] = d[i] * cos(angle) - d[i + half_dim] * sin(angle) -// new_d[i + half_dim] = d[i] * sin(angle) + d[i + half_dim] * cos(angle) -// ============================================================================ - -static __global__ void rope_kernel( - half* __restrict__ qk, // [seq_len, num_heads, head_dim] - int num_heads, - int head_dim, - int pos_offset, - float theta_base -) { - int seq_idx = blockIdx.x; - int head = blockIdx.y; - int tid = threadIdx.x; - int half_dim = head_dim / 2; - - if (tid >= half_dim || head >= num_heads) return; - - int pos = seq_idx + pos_offset; - float freq = 1.0f / powf(theta_base, (float)(2 * tid) / (float)head_dim); - float angle = (float)pos * freq; - float cos_val = cosf(angle); - float sin_val = sinf(angle); - - long long base_idx = ((long long)seq_idx * num_heads + head) * head_dim; - float v0 = __half2float(qk[base_idx + tid]); - float v1 = __half2float(qk[base_idx + tid + half_dim]); - qk[base_idx + tid] = __float2half(v0 * cos_val - v1 * sin_val); - qk[base_idx + tid + half_dim] = __float2half(v0 * sin_val + v1 * cos_val); -} - -extern "C" void launch_rope( - void* q, - void* k, - int seq_len, - int num_heads, - int num_kv_heads, - int head_dim, - int pos_offset, - float theta_base, - void* stream -) { - int threads = head_dim / 2; - if (threads > BLOCK_SIZE) threads = BLOCK_SIZE; - dim3 grid_q(seq_len, num_heads); - dim3 grid_k(seq_len, num_kv_heads); - - rope_kernel<<>>( - (half*)q, num_heads, head_dim, pos_offset, theta_base - ); - rope_kernel<<>>( - (half*)k, num_kv_heads, head_dim, pos_offset, theta_base - ); -} - -// ============================================================================ -// SwiGLU: output = silu(gate) * up, where silu(x) = x * sigmoid(x) -// ============================================================================ - -static __global__ void swiglu_kernel( - const half* __restrict__ gate, - const half* __restrict__ up, - half* __restrict__ output, - int 
total_elements -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; - - float g = __half2float(gate[idx]); - float u = __half2float(up[idx]); - float silu_g = g / (1.0f + expf(-g)); - output[idx] = __float2half(silu_g * u); -} - -extern "C" void launch_swiglu( - const void* gate, - const void* up, - void* output, - int total_elements, - void* stream -) { - int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - swiglu_kernel<<>>( - (const half*)gate, (const half*)up, (half*)output, total_elements - ); -} - -// ============================================================================ -// Residual add: output += residual (in-place) -// ============================================================================ - -static __global__ void residual_add_kernel( - half* __restrict__ output, - const half* __restrict__ residual, - int total_elements -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; - float val = __half2float(output[idx]) + __half2float(residual[idx]); - output[idx] = __float2half(val); -} - -extern "C" void launch_residual_add( - void* output, - const void* residual, - int total_elements, - void* stream -) { - int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - residual_add_kernel<<>>( - (half*)output, (const half*)residual, total_elements - ); -} - -// ============================================================================ -// Bias add: output[pos * dim + d] += bias[d] for all positions -// ============================================================================ - -static __global__ void bias_add_kernel( - half* __restrict__ output, - const half* __restrict__ bias, - int dim, - int seq_len -) { - int pos = blockIdx.x; - int tid = threadIdx.x; - int stride = blockDim.x; - - if (pos >= seq_len) return; - - half* out = output + (long long)pos * dim; - for (int d = tid; d < dim; d += stride) { - float val = __half2float(out[d]) + 
__half2float(bias[d]); - out[d] = __float2half(val); - } -} - -extern "C" void launch_bias_add( - void* output, - const void* bias, - int dim, - int seq_len, - void* stream -) { - int threads = (dim < BLOCK_SIZE) ? dim : BLOCK_SIZE; - bias_add_kernel<<>>( - (half*)output, (const half*)bias, dim, seq_len - ); -} - -// ============================================================================ -// Embedding lookup: output[i] = embed_table[token_ids[i]] -// ============================================================================ - -static __global__ void embedding_kernel( - const half* __restrict__ embed_table, - const int* __restrict__ token_ids, - half* __restrict__ output, - int hidden_size, - int seq_len -) { - int pos = blockIdx.x; - int tid = threadIdx.x; - int stride = blockDim.x; - - if (pos >= seq_len) return; - - int token_id = token_ids[pos]; - const half* row = embed_table + (long long)token_id * hidden_size; - half* out = output + (long long)pos * hidden_size; - - for (int i = tid; i < hidden_size; i += stride) { - out[i] = row[i]; - } -} - -extern "C" void launch_embedding( - const void* embed_table, - const int* token_ids, - void* output, - int hidden_size, - int seq_len, - void* stream -) { - int threads = (hidden_size < BLOCK_SIZE) ? hidden_size : BLOCK_SIZE; - embedding_kernel<<>>( - (const half*)embed_table, token_ids, (half*)output, - hidden_size, seq_len - ); -} - -// ============================================================================ -// Causal attention (single-head, strided access for GQA) -// -// Q, K, V, output may be strided — heads are interleaved within each -// position in the multi-head tensor. The stride parameters tell the -// kernel how many elements to skip between consecutive positions. -// -// For prefill (seq_len > 1): full QK^T + causal mask + softmax + V -// For decode (seq_len == 1): dot product attention against KV cache -// -// This is a simple implementation. For production, use FlashAttention. 
-// ============================================================================ - -static __global__ void attention_scores_kernel( - const half* __restrict__ q, // [seq_len, ...] one head, strided - const half* __restrict__ k, // [total_len, ...] one KV head, strided - float* __restrict__ scores, // [seq_len, total_len] - int seq_len, - int total_len, - int head_dim, - int q_stride, // elements between positions in Q - int kv_stride, // elements between positions in K - float scale -) { - int q_pos = blockIdx.x; - int tid = threadIdx.x; - int stride = blockDim.x; - - if (q_pos >= seq_len) return; - - const half* q_row = q + (long long)q_pos * q_stride; - - for (int k_pos = tid; k_pos < total_len; k_pos += stride) { - // Causal mask: only attend to positions <= q_pos + (total_len - seq_len) - int q_abs = q_pos + (total_len - seq_len); - if (k_pos > q_abs) { - scores[q_pos * total_len + k_pos] = -1e9f; - continue; - } - - const half* k_row = k + (long long)k_pos * kv_stride; - float dot = 0.0f; - for (int d = 0; d < head_dim; d++) { - dot += __half2float(q_row[d]) * __half2float(k_row[d]); - } - scores[q_pos * total_len + k_pos] = dot * scale; - } -} - -static __global__ void softmax_kernel( - float* __restrict__ data, - int cols -) { - int row = blockIdx.x; - int tid = threadIdx.x; - int stride = blockDim.x; - float* row_data = data + (long long)row * cols; - - extern __shared__ float sdata[]; - - // Find max - float local_max = -1e30f; - for (int i = tid; i < cols; i += stride) { - if (row_data[i] > local_max) local_max = row_data[i]; - } - sdata[tid] = local_max; - __syncthreads(); - - for (int s = stride / 2; s > 0; s >>= 1) { - if (tid < s) sdata[tid] = fmaxf(sdata[tid], sdata[tid + s]); - __syncthreads(); - } - float max_val = sdata[0]; - - // Sum of exp - float local_sum = 0.0f; - for (int i = tid; i < cols; i += stride) { - row_data[i] = expf(row_data[i] - max_val); - local_sum += row_data[i]; - } - sdata[tid] = local_sum; - __syncthreads(); - - for (int s = 
stride / 2; s > 0; s >>= 1) { - if (tid < s) sdata[tid] += sdata[tid + s]; - __syncthreads(); - } - - float inv_sum = 1.0f / sdata[0]; - for (int i = tid; i < cols; i += stride) { - row_data[i] *= inv_sum; - } -} - -static __global__ void attention_output_kernel( - const float* __restrict__ scores, // [seq_len, total_len] - const half* __restrict__ v, // [total_len, ...] one KV head, strided - half* __restrict__ output, // [seq_len, ...] one head, strided - int seq_len, - int total_len, - int head_dim, - int kv_stride, // elements between positions in V - int out_stride // elements between positions in output -) { - int q_pos = blockIdx.x; - int d = threadIdx.x; - - if (q_pos >= seq_len || d >= head_dim) return; - - const float* score_row = scores + (long long)q_pos * total_len; - - float acc = 0.0f; - for (int k_pos = 0; k_pos < total_len; k_pos++) { - acc += score_row[k_pos] * __half2float(v[(long long)k_pos * kv_stride + d]); - } - output[(long long)q_pos * out_stride + d] = __float2half(acc); -} - -extern "C" void launch_attention( - const void* q, // [seq_len, ...] one head (strided) - const void* k, // [total_len, ...] one KV head (strided) - const void* v, // [total_len, ...] one KV head (strided) - void* output, // [seq_len, ...] one head (strided) - void* scores_buf, // [seq_len, total_len] f32 scratch - int seq_len, - int total_len, - int head_dim, - int q_stride, // stride between positions in Q/output - int kv_stride, // stride between positions in K/V - int out_stride, // stride between positions in output - float scale, - void* stream -) { - cudaStream_t s = (cudaStream_t)stream; - - // Step 1: QK^T scores with causal mask - int threads1 = (total_len < BLOCK_SIZE) ? total_len : BLOCK_SIZE; - attention_scores_kernel<<>>( - (const half*)q, (const half*)k, (float*)scores_buf, - seq_len, total_len, head_dim, q_stride, kv_stride, scale - ); - - // Step 2: Softmax - int threads2 = (total_len < BLOCK_SIZE) ? 
total_len : BLOCK_SIZE; - int t2 = 1; - while (t2 < threads2) t2 <<= 1; - if (t2 > BLOCK_SIZE) t2 = BLOCK_SIZE; - int smem = t2 * sizeof(float); - softmax_kernel<<>>( - (float*)scores_buf, total_len - ); - - // Step 3: Score * V - int threads3 = (head_dim < BLOCK_SIZE) ? head_dim : BLOCK_SIZE; - attention_output_kernel<<>>( - (const float*)scores_buf, (const half*)v, (half*)output, - seq_len, total_len, head_dim, kv_stride, out_stride - ); -} - -// ============================================================================ -// Split fused projection output into separate contiguous buffers. -// -// After a fused QKV or Gate+Up GEMM, the output is [seq_len, total_dim] -// with sub-tensors interleaved per row. This kernel deinterleaves into -// separate contiguous output buffers. -// ============================================================================ - -static __global__ void split_fused_kernel( - const half* __restrict__ fused, // [seq_len, total_dim] - half* __restrict__ out_a, // [seq_len, dim_a] - half* __restrict__ out_b, // [seq_len, dim_b] - half* __restrict__ out_c, // [seq_len, dim_c] or NULL for 2-way split - int seq_len, - int total_dim, - int dim_a, - int dim_b -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int total = seq_len * total_dim; - if (idx >= total) return; - - int pos = idx / total_dim; - int d = idx % total_dim; - half val = fused[idx]; - - if (d < dim_a) { - out_a[pos * dim_a + d] = val; - } else if (d < dim_a + dim_b) { - out_b[pos * dim_b + (d - dim_a)] = val; - } else if (out_c) { - int dim_c = total_dim - dim_a - dim_b; - out_c[pos * dim_c + (d - dim_a - dim_b)] = val; - } -} - -extern "C" void launch_split_fused( - const void* fused, - void* out_a, - void* out_b, - void* out_c, // NULL for 2-way split - int seq_len, - int total_dim, - int dim_a, - int dim_b, - void* stream -) { - int total = seq_len * total_dim; - int blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; - split_fused_kernel<<>>( - (const half*)fused, - 
(half*)out_a, (half*)out_b, (half*)out_c, - seq_len, total_dim, dim_a, dim_b - ); -} - -// ============================================================================ -// FlashAttention 2 — fused tiled attention with online softmax -// -// Replaces the 3-kernel naive pipeline (scores → softmax → output) with a -// single kernel that tiles Q into Br-row blocks, iterates K/V in Bc-row -// blocks, and uses online softmax (running max + sum) so the full -// [seq_len × total_len] attention matrix is never materialized. -// -// Handles GQA natively: multiple Q heads share the same KV head. -// -// Grid: (ceil(seq_len / BR), num_heads) -// Block: 128 threads -// Smem: Q tile (16 KB) + KV tile (16 KB) + score tile (16 KB) = 48 KB -// ============================================================================ - -#define FA2_BR 64 -#define FA2_BC 64 -#define FA2_THREADS 128 -#define FA2_MAX_HALF_DIM 128 // head_dim / 2, supports up to head_dim=256 - -static __global__ void __launch_bounds__(FA2_THREADS, 4) -flash_attention_2_kernel( - const half* __restrict__ Q, // [seq_len, num_heads * head_dim] - const half* __restrict__ K, // [total_len, num_kv_heads * head_dim] - const half* __restrict__ V, // [total_len, num_kv_heads * head_dim] - half* __restrict__ O, // [seq_len, num_heads * head_dim] - int seq_len, - int total_len, - int num_heads, - int num_kv_heads, - int head_dim, - float scale, - int causal -) { - int q_tile_idx = blockIdx.x; - int head = blockIdx.y; - int tid = threadIdx.x; - - int br_start = q_tile_idx * FA2_BR; - int gqa_groups = num_heads / num_kv_heads; - int kv_head = head / gqa_groups; - int pos_offset = total_len - seq_len; - - // Thread-to-output mapping: 2 threads per Q row, each handles half of head_dim - int my_row = tid % FA2_BR; // 0..63 - int dim_half = tid / FA2_BR; // 0 or 1 - int half_dim = head_dim / 2; // 64 for hd=128 - int dim_start = dim_half * half_dim; - - // Global memory strides - int q_stride = num_heads * head_dim; - int kv_stride = 
num_kv_heads * head_dim; - - // Shared memory: Q tile | KV tile | Score tile - extern __shared__ char smem_raw[]; - half* smem_Q = (half*)smem_raw; - half* smem_KV = smem_Q + FA2_BR * head_dim; - float* smem_S = (float*)(smem_KV + FA2_BC * head_dim); - - // Online softmax state (per-thread, per-row) - float m_val = -1e30f; - float l_val = 0.0f; - - // Output accumulator — half_dim registers (64 for hd=128) - float O_acc[FA2_MAX_HALF_DIM]; - for (int d = 0; d < half_dim; d++) { - O_acc[d] = 0.0f; - } - - // ---- Load Q tile (stays resident for all KV iterations) ---- - { - int total_elems = FA2_BR * head_dim; - for (int idx = tid; idx < total_elems; idx += FA2_THREADS) { - int row = idx / head_dim; - int d = idx % head_dim; - int global_row = br_start + row; - if (global_row < seq_len) { - smem_Q[idx] = Q[(long long)global_row * q_stride + (long long)head * head_dim + d]; - } else { - smem_Q[idx] = __float2half(0.0f); - } - } - } - __syncthreads(); - - // ---- KV block loop ---- - int num_kv_blocks = (total_len + FA2_BC - 1) / FA2_BC; - - for (int kv_blk = 0; kv_blk < num_kv_blocks; kv_blk++) { - int kv_start = kv_blk * FA2_BC; - - // Causal early exit: all K positions in this block are beyond all Q positions - if (causal) { - int max_q_abs = br_start + FA2_BR - 1 + pos_offset; - if (kv_start > max_q_abs) break; - } - - // ---- Load K block ---- - { - int total_elems = FA2_BC * head_dim; - for (int idx = tid; idx < total_elems; idx += FA2_THREADS) { - int row = idx / head_dim; - int d = idx % head_dim; - int global_row = kv_start + row; - if (global_row < total_len) { - smem_KV[idx] = K[(long long)global_row * kv_stride + (long long)kv_head * head_dim + d]; - } else { - smem_KV[idx] = __float2half(0.0f); - } - } - } - __syncthreads(); - - // ---- S = Q @ K^T * scale (with causal mask) → smem_S ---- - { - int total_s = FA2_BR * FA2_BC; - for (int idx = tid; idx < total_s; idx += FA2_THREADS) { - int i = idx / FA2_BC; - int j = idx % FA2_BC; - - float dot = 0.0f; - 
for (int d = 0; d < head_dim; d++) { - dot += __half2float(smem_Q[i * head_dim + d]) - * __half2float(smem_KV[j * head_dim + d]); - } - dot *= scale; - - // Causal mask: Q at absolute position q_abs can only attend to k_abs <= q_abs - if (causal) { - int q_abs = br_start + i + pos_offset; - int k_abs = kv_start + j; - if (k_abs > q_abs) dot = -1e30f; - } - // Mask out-of-bounds rows - if ((br_start + i) >= seq_len || (kv_start + j) >= total_len) { - dot = -1e30f; - } - - smem_S[idx] = dot; - } - } - __syncthreads(); - - // ---- Online softmax update for my_row ---- - // Both threads sharing this row (dim_half=0,1) compute identical m/l values - float block_max = -1e30f; - for (int j = 0; j < FA2_BC; j++) { - float s = smem_S[my_row * FA2_BC + j]; - if (s > block_max) block_max = s; - } - - float m_new = fmaxf(m_val, block_max); - float alpha = expf(m_val - m_new); - - float block_sum = 0.0f; - for (int j = 0; j < FA2_BC; j++) { - block_sum += expf(smem_S[my_row * FA2_BC + j] - m_new); - } - - l_val = l_val * alpha + block_sum; - for (int d = 0; d < half_dim; d++) { - O_acc[d] *= alpha; - } - m_val = m_new; - - __syncthreads(); - - // ---- Load V block (reuse smem_KV, K no longer needed) ---- - { - int total_elems = FA2_BC * head_dim; - for (int idx = tid; idx < total_elems; idx += FA2_THREADS) { - int row = idx / head_dim; - int d = idx % head_dim; - int global_row = kv_start + row; - if (global_row < total_len) { - smem_KV[idx] = V[(long long)global_row * kv_stride + (long long)kv_head * head_dim + d]; - } else { - smem_KV[idx] = __float2half(0.0f); - } - } - } - __syncthreads(); - - // ---- O_acc += P @ V (P recomputed from S and m_val) ---- - for (int j = 0; j < FA2_BC; j++) { - float p_val = expf(smem_S[my_row * FA2_BC + j] - m_val); - if (p_val > 1e-10f) { - for (int d = 0; d < half_dim; d++) { - O_acc[d] += p_val * __half2float(smem_KV[j * head_dim + dim_start + d]); - } - } - } - - __syncthreads(); - } - - // ---- Normalize and write output ---- - int 
global_row = br_start + my_row; - if (global_row < seq_len) { - float inv_l = (l_val > 0.0f) ? (1.0f / l_val) : 0.0f; - long long out_base = (long long)global_row * q_stride + (long long)head * head_dim; - for (int d = 0; d < half_dim; d++) { - O[out_base + dim_start + d] = __float2half(O_acc[d] * inv_l); - } - } -} - -extern "C" void launch_flash_attention( - const void* Q, - const void* K, - const void* V, - void* O, - int seq_len, - int total_len, - int num_heads, - int num_kv_heads, - int head_dim, - float scale, - int causal, - void* stream -) { - int q_tiles = (seq_len + FA2_BR - 1) / FA2_BR; - dim3 grid(q_tiles, num_heads); - dim3 block(FA2_THREADS); - - int smem_bytes = FA2_BR * head_dim * sizeof(half) // smem_Q - + FA2_BC * head_dim * sizeof(half) // smem_KV - + FA2_BR * FA2_BC * sizeof(float); // smem_S - - flash_attention_2_kernel<<>>( - (const half*)Q, (const half*)K, (const half*)V, (half*)O, - seq_len, total_len, num_heads, num_kv_heads, head_dim, - scale, causal - ); -} - -// ============================================================================ -// Fused decode attention (seq_len=1) -// -// Optimized for next-token generation. Each warp independently processes a -// stream of K/V positions — NO __syncthreads in the hot loop. -// -// Thread layout (dynamic, adapts to head_dim): -// Block size = head_dim threads (must be a multiple of 32) -// nwarps = head_dim / 32, dpt = head_dim / 32 (dims per thread) -// Each thread handles dpt dimensions: dims [lane*dpt .. lane*dpt+dpt-1] -// Warp w processes positions w, w+nwarps, w+2*nwarps, ... (interleaved) -// -// Examples: head_dim=128 → 4 warps, 4 dpt. head_dim=160 → 5 warps, 5 dpt. -// -// Each warp computes its own partial online-softmax result. After the loop, -// warps combine via shared memory using the log-sum-exp trick (1 sync). -// -// Memory: consecutive threads read consecutive K/V elements → coalesced. -// Supports head_dim up to 256 (DA_MAX_DPT=8). 
-// ============================================================================ - -// Max dims per thread — supports head_dim up to 512 (512/32 = 16). -// Gemma 4 global attention layers use head_dim=512. -#define DA_MAX_DPT 16 - -static __global__ void decode_fused_attention_kernel( - const half* __restrict__ Q, // [num_heads, head_dim] - const half* __restrict__ K, // [total_len, num_kv_heads, head_dim] - const half* __restrict__ V, // [total_len, num_kv_heads, head_dim] - half* __restrict__ O, // [num_heads, head_dim] - int num_heads, - int num_kv_heads, - int head_dim, - int total_len, - float scale -) { - const int head = blockIdx.x; - const int tid = threadIdx.x; - const int warp_id = tid / 32; - const int lane = tid & 31; - - // Compute layout from head_dim (block size = head_dim) - const int dpt = head_dim / 32; // dims per thread - const int nwarps = head_dim / 32; // warps per block - - const int gqa_groups = num_heads / num_kv_heads; - const int kv_head = head / gqa_groups; - const int kv_stride = num_kv_heads * head_dim; - - // Each thread loads its dpt Q dimensions - float q[DA_MAX_DPT]; - for (int i = 0; i < dpt; i++) { - q[i] = __half2float(Q[head * head_dim + lane * dpt + i]); - } - - // Per-warp online softmax state (registers) - float m = -1e30f; - float l = 0.0f; - float acc[DA_MAX_DPT]; - for (int i = 0; i < dpt; i++) acc[i] = 0.0f; - - // Hot loop: each warp processes its own position stream - for (int pos = warp_id; pos < total_len; pos += nwarps) { - long long base = (long long)pos * kv_stride + kv_head * head_dim - + lane * dpt; - - // Dot product: each thread multiplies its dpt dims, warp reduces - float dot = 0.0f; - for (int i = 0; i < dpt; i++) { - dot += q[i] * __half2float(K[base + i]); - } - // Warp-level reduction (shuffle only — no shared mem, no sync!) 
- #pragma unroll - for (int offset = 16; offset > 0; offset >>= 1) { - dot += __shfl_xor_sync(0xFFFFFFFF, dot, offset); - } - float score = dot * scale; // all lanes have the score - - // Online softmax + V accumulation - float new_m = fmaxf(m, score); - float correction = expf(m - new_m); - float p = expf(score - new_m); - - for (int i = 0; i < dpt; i++) { - acc[i] = acc[i] * correction + p * __half2float(V[base + i]); - } - l = l * correction + p; - m = new_m; - } - - // === Combine warps' partial results (log-sum-exp trick) === - // Dynamic shared memory layout: [nwarps floats m][nwarps floats l][nwarps*head_dim floats acc] - extern __shared__ char da_smem_raw[]; - float* smem_m = (float*)da_smem_raw; - float* smem_l = smem_m + nwarps; - float* smem_acc = smem_l + nwarps; - - if (lane == 0) { - smem_m[warp_id] = m; - smem_l[warp_id] = l; - } - for (int i = 0; i < dpt; i++) { - smem_acc[warp_id * head_dim + lane * dpt + i] = acc[i]; - } - __syncthreads(); - - // Find global max across warps (all threads compute, it's cheap) - float global_m = smem_m[0]; - for (int w = 1; w < nwarps; w++) { - global_m = fmaxf(global_m, smem_m[w]); - } - - // Combine each warp's contribution for this thread's dimensions - float final_l = 0.0f; - float final_acc[DA_MAX_DPT]; - for (int i = 0; i < dpt; i++) final_acc[i] = 0.0f; - - for (int w = 0; w < nwarps; w++) { - float corr = expf(smem_m[w] - global_m); - final_l += smem_l[w] * corr; - for (int i = 0; i < dpt; i++) { - final_acc[i] += smem_acc[w * head_dim + lane * dpt + i] * corr; - } - } - - // Write output (coalesced: consecutive threads write consecutive dims) - float inv_l = 1.0f / final_l; - for (int i = 0; i < dpt; i++) { - O[head * head_dim + lane * dpt + i] = - __float2half(final_acc[i] * inv_l); - } -} - -extern "C" void launch_decode_attention( - const void* Q, - const void* K, - const void* V, - void* O, - int num_heads, - int num_kv_heads, - int head_dim, - int total_len, - float scale, - void* stream -) { - 
cudaStream_t s = (cudaStream_t)stream; - int nwarps = head_dim / 32; - // Shared: nwarps floats (m) + nwarps floats (l) + nwarps*head_dim floats (acc) - int smem_bytes = (nwarps * 2 + nwarps * head_dim) * sizeof(float); - - decode_fused_attention_kernel<<>>( - (const half*)Q, (const half*)K, (const half*)V, (half*)O, - num_heads, num_kv_heads, head_dim, total_len, scale - ); -} - -// ============================================================================ -// P2P allreduce sum (2-GPU NVLink) -// ============================================================================ - -static __global__ void allreduce_kernel( - half* __restrict__ local_buf, - const half* __restrict__ peer_buf, - int size -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= size) return; - float val = __half2float(local_buf[idx]) + __half2float(peer_buf[idx]); - local_buf[idx] = __float2half(val); -} - -extern "C" void launch_allreduce_2gpu( - void* local_buf, - const void* peer_buf, - int num_elements, - void* stream -) { - int blocks = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - allreduce_kernel<<>>( - (half*)local_buf, (const half*)peer_buf, num_elements - ); -} - -// ============================================================================ -// Zero buffer: output[i] = 0 for all elements -// Used to clear MoE accumulator before expert dispatch. 
-// ============================================================================ - -static __global__ void zero_kernel(half* __restrict__ output, int total_elements) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; - output[idx] = __float2half(0.0f); -} - -extern "C" void launch_zero( - void* output, - int total_elements, - void* stream -) { - int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - zero_kernel<<>>( - (half*)output, total_elements - ); -} - -// ============================================================================ -// Fused scale-add: output[i] += scale * input[i] -// Used to accumulate weighted expert outputs in MoE dispatch. -// Computes in f32 internally for numerical stability. -// ============================================================================ - -static __global__ void scale_add_kernel( - half* __restrict__ output, - const half* __restrict__ input, - float scale, - int total_elements -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; - float val = __half2float(output[idx]) + scale * __half2float(input[idx]); - output[idx] = __float2half(val); -} - -extern "C" void launch_scale_add( - void* output, - const void* input, - float scale, - int total_elements, - void* stream -) { - int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - scale_add_kernel<<>>( - (half*)output, (const half*)input, scale, total_elements - ); -} - -// ============================================================================ -// Gather rows: dst[i, :] = src[indices[i], :] for i in 0..num_rows -// Used to batch tokens assigned to the same expert into a contiguous buffer -// for efficient GEMM during MoE prefill. 
-// ============================================================================ - -static __global__ void gather_rows_kernel( - const half* __restrict__ src, - half* __restrict__ dst, - const int* __restrict__ indices, - int num_rows, - int row_dim -) { - int row = blockIdx.x; - if (row >= num_rows) return; - int src_row = indices[row]; - int tid = threadIdx.x; - for (int d = tid; d < row_dim; d += blockDim.x) { - dst[(long long)row * row_dim + d] = src[(long long)src_row * row_dim + d]; - } -} - -extern "C" void launch_gather_rows( - const void* src, - void* dst, - const int* indices, - int num_rows, - int row_dim, - void* stream -) { - if (num_rows == 0) return; - gather_rows_kernel<<>>( - (const half*)src, (half*)dst, indices, num_rows, row_dim - ); -} - -// ============================================================================ -// Weighted scatter-add: dst[indices[i], :] += weights[i] * src[i, :] -// Used to accumulate weighted expert outputs back to their token positions -// in mlp_out during MoE prefill. Each expert's scatter runs sequentially -// on the same stream, so no write races between experts. 
-// ============================================================================ - -static __global__ void weighted_scatter_add_kernel( - half* __restrict__ dst, - const half* __restrict__ src, - const int* __restrict__ indices, - const float* __restrict__ weights, - int num_rows, - int row_dim -) { - int row = blockIdx.x; - if (row >= num_rows) return; - int dst_row = indices[row]; - float w = weights[row]; - int tid = threadIdx.x; - for (int d = tid; d < row_dim; d += blockDim.x) { - float val = __half2float(dst[(long long)dst_row * row_dim + d]) - + w * __half2float(src[(long long)row * row_dim + d]); - dst[(long long)dst_row * row_dim + d] = __float2half(val); - } -} - -extern "C" void launch_weighted_scatter_add( - void* dst, - const void* src, - const int* indices, - const float* weights, - int num_rows, - int row_dim, - void* stream -) { - if (num_rows == 0) return; - weighted_scatter_add_kernel<<>>( - (half*)dst, (const half*)src, indices, weights, num_rows, row_dim - ); -} - -// ============================================================================ -// GeGLU: output = gelu_tanh(gate) * up (Gemma 4 activation) -// -// GELU with tanh approximation: -// gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) -// Combined with gated unit: output = gelu(gate) * up -// ============================================================================ - -static __global__ void geglu_kernel( - const half* __restrict__ gate, - const half* __restrict__ up, - half* __restrict__ output, - int total_elements -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; - - float g = __half2float(gate[idx]); - float u = __half2float(up[idx]); - - // GELU tanh approximation: 0.5 * g * (1 + tanh(sqrt(2/pi) * (g + 0.044715 * g^3))) - const float SQRT_2_OVER_PI = 0.7978845608f; // sqrt(2.0 / pi) - float inner = SQRT_2_OVER_PI * (g + 0.044715f * g * g * g); - float gelu_g = 0.5f * g * (1.0f + tanhf(inner)); - - output[idx] = 
__float2half(gelu_g * u); -} - -extern "C" void launch_geglu( - const void* gate, - const void* up, - void* output, - int total_elements, - void* stream -) { - int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - geglu_kernel<<>>( - (const half*)gate, (const half*)up, (half*)output, total_elements - ); -} - -// ============================================================================ -// In-place scale: data[i] *= scale -// -// Used for: -// - Embedding scaling (sqrt(hidden_size)) after lookup -// - Per-layer scalar multiplication -// ============================================================================ - -static __global__ void scale_inplace_kernel( - half* __restrict__ data, - float scale, - int total_elements -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; - data[idx] = __float2half(__half2float(data[idx]) * scale); -} - -extern "C" void launch_scale_inplace( - void* data, - float scale, - int total_elements, - void* stream -) { - int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - scale_inplace_kernel<<>>( - (half*)data, scale, total_elements - ); -} - -// ============================================================================ -// Logit soft-cap: logits[i] = cap * tanh(logits[i] / cap) -// -// Clamps logits to [-cap, +cap] range. Gemma 4 uses cap=30.0. 
-// ============================================================================ - -static __global__ void logit_softcap_kernel( - half* __restrict__ logits, - float cap, - int total_elements -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; - float val = __half2float(logits[idx]); - logits[idx] = __float2half(cap * tanhf(val / cap)); -} - -extern "C" void launch_logit_softcap( - void* logits, - float cap, - int total_elements, - void* stream -) { - int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - logit_softcap_kernel<<>>( - (half*)logits, cap, total_elements - ); -} - -// ============================================================================ -// Scale-free RMSNorm (no learnable weight): -// output = input * rsqrt(mean(input^2) + eps) -// -// Used for Gemma 4 V-norm (value states) and router input normalization. -// Same reduction as rmsnorm_kernel, but without the weight multiply. -// ============================================================================ - -static __global__ void rmsnorm_noscale_kernel( - const half* __restrict__ input, - half* __restrict__ output, - int dim, - float eps -) { - int row = blockIdx.x; - int tid = threadIdx.x; - int stride = blockDim.x; - - const half* x = input + (long long)row * dim; - half* out = output + (long long)row * dim; - - extern __shared__ float sdata[]; - - float local_sum = 0.0f; - for (int i = tid; i < dim; i += stride) { - float val = __half2float(x[i]); - local_sum += val * val; - } - sdata[tid] = local_sum; - __syncthreads(); - - for (int s = stride / 2; s > 0; s >>= 1) { - if (tid < s) { - sdata[tid] += sdata[tid + s]; - } - __syncthreads(); - } - - float rms = rsqrtf(sdata[0] / (float)dim + eps); - - for (int i = tid; i < dim; i += stride) { - out[i] = __float2half(__half2float(x[i]) * rms); - } -} - -extern "C" void launch_rmsnorm_noscale( - const void* input, - void* output, - int batch_size, - int dim, - float eps, - void* stream -) { - int 
threads = (dim < BLOCK_SIZE) ? dim : BLOCK_SIZE;
-    int smem = threads * sizeof(float);
-    rmsnorm_noscale_kernel<<<batch_size, threads, smem, (cudaStream_t)stream>>>(
-        (const half*)input, (half*)output, dim, eps
-    );
-}
-
-// ============================================================================
-// Partial RoPE: Apply rotary embeddings to only the first rotary_dim
-// dimensions of each head, leaving the remaining dimensions unchanged.
-//
-// Gemma 4 global attention layers use partial_rotary_factor=0.25, meaning
-// only 128 out of 512 head dimensions get rotary encoding.
-//
-// Uses the same rotary-half convention as launch_rope:
-//   pairs are (d[i], d[i + half_rotary]) for i in 0..half_rotary
-// ============================================================================
-
-static __global__ void rope_partial_kernel(
-    half* __restrict__ qk,   // [seq_len, num_heads, head_dim]
-    int num_heads,
-    int head_dim,
-    int rotary_dim,          // first rotary_dim dims get RoPE
-    int pos_offset,
-    float theta_base
-) {
-    int seq_idx = blockIdx.x;
-    int head = blockIdx.y;
-    int tid = threadIdx.x;
-    int half_rotary = rotary_dim / 2;
-
-    if (tid >= half_rotary || head >= num_heads) return;
-
-    int pos = seq_idx + pos_offset;
-    float freq = 1.0f / powf(theta_base, (float)(2 * tid) / (float)rotary_dim);
-    float angle = (float)pos * freq;
-    float cos_val = cosf(angle);
-    float sin_val = sinf(angle);
-
-    long long base_idx = ((long long)seq_idx * num_heads + head) * head_dim;
-    float v0 = __half2float(qk[base_idx + tid]);
-    float v1 = __half2float(qk[base_idx + tid + half_rotary]);
-    qk[base_idx + tid] = __float2half(v0 * cos_val - v1 * sin_val);
-    qk[base_idx + tid + half_rotary] = __float2half(v0 * sin_val + v1 * cos_val);
-    // Dimensions [rotary_dim, head_dim) are untouched
-}
-
-extern "C" void launch_rope_partial(
-    void* q,
-    void* k,
-    int seq_len,
-    int num_heads,
-    int num_kv_heads,
-    int head_dim,
-    int rotary_dim,
-    int pos_offset,
-    float theta_base,
-    void* stream
-) {
-    int threads = rotary_dim / 2;
-    if (threads
> BLOCK_SIZE) threads = BLOCK_SIZE;
-    dim3 grid_q(seq_len, num_heads);
-    dim3 grid_k(seq_len, num_kv_heads);
-
-    rope_partial_kernel<<<grid_q, threads, 0, (cudaStream_t)stream>>>(
-        (half*)q, num_heads, head_dim, rotary_dim, pos_offset, theta_base
-    );
-    rope_partial_kernel<<<grid_k, threads, 0, (cudaStream_t)stream>>>(
-        (half*)k, num_kv_heads, head_dim, rotary_dim, pos_offset, theta_base
-    );
-}
diff --git a/crates/weaver-inference/src/lib.rs b/crates/weaver-inference/src/lib.rs
deleted file mode 100644
index 5c9e8f52..00000000
--- a/crates/weaver-inference/src/lib.rs
+++ /dev/null
@@ -1,128 +0,0 @@
-//! Deprecated re-export shell — `weaver-inference`'s decoder-side
-//! modules folded into `weaver-spu` per `docs/specs/weaver-spu-Spec.md`
-//! PR-0.5.C.
-//!
-//! New code should import from `weaver_spu::decoder::*` (decoder
-//! modules) or `weaver_spu::core::*` (orchestrator, vram, nvlink,
-//! probe, gpu primitives).
-//!
-//! This crate stays in the workspace through the migration window so
-//! existing consumers compile unchanged. Removed in PR-0.5.E along
-//! with `weaver-embedding` and the rest of the legacy crates.
-
-// Module-scoped allow — this entire crate is a deprecated shell.
-// Every re-export below points at a stable weaver-spu module and
-// would otherwise need an individual `#[allow(deprecated)]`. The
-// crate retires in PR-0.5.E. External consumers see deprecation
-// warnings on their own `use` lines.
-#![allow(deprecated)] - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::config; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::config; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::download; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::download; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::engine; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::engine; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::family; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::family; - -#[deprecated( - since = "0.1.0", - note = "renamed to weaver_spu::decoder::llama_cpp_backend; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::llama_cpp_backend as backend; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::model; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::model; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::model_profile; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::model_profile; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::multi_model; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::multi_model; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::runtime_config; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::runtime_config; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::server; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::server; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::startup; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::startup; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::suggest; this 
re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::suggest; - -#[cfg(feature = "gguf")] -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::gguf; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::gguf; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::core::gpu_orchestrator; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::core::gpu_orchestrator; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::core::nvlink; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::core::nvlink; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::core::probe; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::core::probe; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::core::vram; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::core::vram; - -#[cfg(feature = "cuda")] -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::core::gpu; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::core::gpu; diff --git a/crates/weaver-interface/Cargo.toml b/crates/weaver-interface/Cargo.toml index 2e2d2516..de003827 100644 --- a/crates/weaver-interface/Cargo.toml +++ b/crates/weaver-interface/Cargo.toml @@ -15,7 +15,12 @@ path = "src/main.rs" [features] default = [] -inference = ["dep:weaver-inference"] +# `inference` opts the daemon into the candle decoder + GGUF/CUDA +# backends (post-PR-0.5.E this maps onto weaver-spu's `cuda` and +# `gguf` features). Pre-PR-0.5.E this enabled `dep:weaver-inference` +# which is now removed; the feature now activates the corresponding +# weaver-spu features instead. +inference = ["weaver-spu/cuda", "weaver-spu/gguf"] # `viewer` pulls in weaver-analysis (manifest schema + load) so the # trace viewer can ingest the bench-emitted manifest.json companion. 
viewer = ["dep:ratatui", "dep:crossterm", "dep:weaver-analysis"] @@ -25,9 +30,11 @@ weaver-core = { workspace = true } weaver-database = { path = "../weaver-database" } weaver-demo = { path = "../weaver-demo" } weaver-trace = { path = "../weaver-trace" } -weaver-embedding = { workspace = true } - -weaver-inference = { path = "../weaver-inference", features = ["cuda", "gguf"], optional = true } +# `weaver-spu` carries the decoder + encoder runtime post-PR-0.5.C/D/E. +# The `inference` feature above forwards `cuda` and `gguf` features +# through to `weaver-spu`. `default-features = false` per the +# workspace dep declaration. +weaver-spu = { workspace = true } anyhow = { workspace = true } async-trait = { workspace = true } chrono = { workspace = true } diff --git a/crates/weaver-interface/src/harness.rs b/crates/weaver-interface/src/harness.rs index 7f8736e9..720c4003 100644 --- a/crates/weaver-interface/src/harness.rs +++ b/crates/weaver-interface/src/harness.rs @@ -6,7 +6,7 @@ //! //! Today this is a single subcommand tree (`embedder`) for inspecting and //! managing the embedder identity pin written by `weaver serve` on first -//! boot. See `weaver_embedding::pin` for the lock-file design. +//! boot. See `weaver_spu::core::pin` for the lock-file design. //! //! The commands here do not touch the running embedder or GPU state — they //! only read and write `/opt/weaver/state/embedder.pin.json`. 
Operators use @@ -18,7 +18,7 @@ use std::path::{Path, PathBuf}; use anyhow::{Context, Result, bail}; use clap::{Args, Subcommand}; -use weaver_embedding::pin::{DEFAULT_PIN_PATH, EmbedderPin, WriteIfAbsent}; +use weaver_spu::core::pin::{DEFAULT_PIN_PATH, EmbedderPin, WriteIfAbsent}; #[derive(Subcommand, Debug)] pub enum HarnessAction { diff --git a/crates/weaver-interface/src/lib.rs b/crates/weaver-interface/src/lib.rs index 701c31ca..1d01959b 100644 --- a/crates/weaver-interface/src/lib.rs +++ b/crates/weaver-interface/src/lib.rs @@ -6,11 +6,13 @@ pub mod auth; pub mod daemon_client; -// `embedder_pin` moved to `weaver-embedding::pin` per -// `embedder-oxidization-Spec.md` §5.1 (issue #166 / sprint -// Block A.4). Cohort-pin is an embedding-substrate concern, not -// a CLI concern; it lives alongside the loader paths it guards. -// External consumers import from `weaver_embedding::pin` now. +// `embedder_pin` lives at `weaver_spu::core::pin` per +// `docs/specs/weaver-spu-Spec.md` PR-0.5.D (relocated there from +// the prior `weaver-embedding::pin` home in the original +// embedder-oxidization sprint). Cohort-pin is an embedding- +// substrate concern, not a CLI concern; it lives alongside the +// loader paths it guards. External consumers import from +// `weaver_spu::core::pin`. 
pub mod kinds; pub mod protocol; pub mod server; diff --git a/crates/weaver-interface/src/model.rs b/crates/weaver-interface/src/model.rs index fbbfe638..09db3ef4 100644 --- a/crates/weaver-interface/src/model.rs +++ b/crates/weaver-interface/src/model.rs @@ -241,7 +241,7 @@ async fn handle_load( #[cfg(feature = "inference")] fn handle_probe(path: String) -> Result<()> { use std::path::Path; - use weaver_inference::probe::probe_gguf; + use weaver_spu::core::probe::probe_gguf; let probe = probe_gguf(Path::new(&path))?; @@ -304,7 +304,7 @@ fn handle_probe(path: String) -> Result<()> { #[cfg(feature = "inference")] fn handle_families() -> Result<()> { - use weaver_inference::family::{ModelMode, supported_families}; + use weaver_spu::decoder::family::{ModelMode, supported_families}; let families = supported_families(); if families.is_empty() { @@ -342,7 +342,7 @@ fn handle_families() -> Result<()> { #[cfg(feature = "inference")] fn handle_registry(config: String) -> Result<()> { use std::path::Path; - use weaver_inference::multi_model::MultiModelConfig; + use weaver_spu::decoder::multi_model::MultiModelConfig; let path = Path::new(&config); if !path.exists() { @@ -396,9 +396,9 @@ async fn handle_install( socket: String, ) -> Result<()> { use std::path::PathBuf; - use weaver_inference::download::download_gguf_file; - use weaver_inference::probe::probe_gguf; - use weaver_inference::suggest::suggest_entry; + use weaver_spu::decoder::download::download_gguf_file; + use weaver_spu::core::probe::probe_gguf; + use weaver_spu::decoder::suggest::suggest_entry; use crate::toml_edit::append_model_entry; @@ -465,7 +465,7 @@ async fn handle_install( #[cfg(feature = "inference")] fn resolve_entry_name( explicit: Option<&str>, - probe: &weaver_inference::probe::GgufProbe, + probe: &weaver_spu::core::probe::GgufProbe, filename: &str, ) -> String { if let Some(n) = explicit { @@ -487,9 +487,9 @@ fn resolve_entry_name( #[cfg(feature = "inference")] async fn load_after_install( socket: 
&str, - entry: &weaver_inference::multi_model::ModelEntry, + entry: &weaver_spu::decoder::multi_model::ModelEntry, ) -> Result<()> { - use weaver_inference::multi_model::GpuSpec; + use weaver_spu::decoder::multi_model::GpuSpec; let provider = OpenAiProvider::unix(socket, 120)?; @@ -754,7 +754,7 @@ mod tests { #[cfg(feature = "inference")] #[test] fn resolve_entry_name_prefers_explicit() { - let probe = weaver_inference::probe::GgufProbe { + let probe = weaver_spu::core::probe::GgufProbe { gguf_version: 3, arch: "qwen2".to_string(), name: Some("Qwen Coder 32B".to_string()), @@ -771,7 +771,7 @@ mod tests { #[cfg(feature = "inference")] #[test] fn resolve_entry_name_uses_probe_name_when_available() { - let probe = weaver_inference::probe::GgufProbe { + let probe = weaver_spu::core::probe::GgufProbe { gguf_version: 3, arch: "qwen2".to_string(), name: Some("qwen25-coder".to_string()), @@ -788,7 +788,7 @@ mod tests { #[cfg(feature = "inference")] #[test] fn resolve_entry_name_falls_back_to_filename_stem() { - let probe = weaver_inference::probe::GgufProbe { + let probe = weaver_spu::core::probe::GgufProbe { gguf_version: 3, arch: "qwen2".to_string(), name: None, @@ -805,7 +805,7 @@ mod tests { #[cfg(feature = "inference")] #[test] fn resolve_entry_name_ignores_whitespace_only_probe_name() { - let probe = weaver_inference::probe::GgufProbe { + let probe = weaver_spu::core::probe::GgufProbe { gguf_version: 3, arch: "qwen2".to_string(), name: Some(" ".to_string()), diff --git a/crates/weaver-interface/src/serve.rs b/crates/weaver-interface/src/serve.rs index 85b02d18..e1cd69f4 100644 --- a/crates/weaver-interface/src/serve.rs +++ b/crates/weaver-interface/src/serve.rs @@ -10,11 +10,11 @@ use std::time::Duration; use anyhow::{Result, bail}; use clap::Args; -use weaver_embedding::grpc_client::{ +use weaver_spu::encoder::grpc_client_legacy::{ DEFAULT_EMBEDDER_SOCKET, EmbeddingClient, EmbeddingClientConfig, EmbeddingEndpoint, }; -use weaver_embedding::pin::{self, 
DEFAULT_PIN_PATH, EmbedderPin, VerifyOutcome}; +use weaver_spu::core::pin::{self, DEFAULT_PIN_PATH, EmbedderPin, VerifyOutcome}; const EMBEDDER_PROBE_TIMEOUT: Duration = Duration::from_secs(3); /// Total wall-clock budget for establishing an embedder probe. Under systemd, /// weaver-embedder.service is Type=simple and declared active the moment @@ -45,9 +45,9 @@ pub struct ServeArgs { pub async fn handle_serve(args: ServeArgs) -> Result<()> { // Load config or use empty defaults (server starts with no models) let mut config = if let Some(ref path) = args.config { - weaver_inference::multi_model::MultiModelConfig::from_file(path)? + weaver_spu::decoder::multi_model::MultiModelConfig::from_file(path)? } else { - weaver_inference::multi_model::MultiModelConfig::default() + weaver_spu::decoder::multi_model::MultiModelConfig::default() }; // Apply CLI overrides @@ -64,7 +64,7 @@ pub async fn handle_serve(args: ServeArgs) -> Result<()> { } // Build server state (loads any models declared in config) - let (state, loaded_names) = weaver_inference::startup::build_server_state(&config)?; + let (state, loaded_names) = weaver_spu::decoder::startup::build_server_state(&config)?; let model_count = loaded_names.len(); if model_count > 0 { @@ -143,7 +143,7 @@ pub async fn handle_serve(args: ServeArgs) -> Result<()> { } } - weaver_inference::server::serve( + weaver_spu::decoder::server::serve( state, Some(socket_path.as_path()), listen_addr.as_deref(), @@ -166,7 +166,7 @@ pub async fn handle_serve(args: ServeArgs) -> Result<()> { /// Once we're past startup, the refuse-to-start-when-pin-exists check in /// `handle_serve` catches a truly-wedged embedder: probe returns None → /// refuse, rather than silently running without the pin verified. 
-async fn probe_embedder(socket: &str) -> Option { +async fn probe_embedder(socket: &str) -> Option { let client_config = EmbeddingClientConfig { endpoint: EmbeddingEndpoint::Unix(PathBuf::from(socket)), connect_timeout: EMBEDDER_PROBE_TIMEOUT, @@ -215,7 +215,7 @@ async fn probe_embedder(socket: &str) -> Option Result<()> { let live = EmbedderPin::now( &info.model_name, @@ -256,7 +256,7 @@ mod tests { use super::*; use clap::Parser; use tempfile::TempDir; - use weaver_embedding::proto::embedding::InfoResponse; + use weaver_spu::proto::embedding::InfoResponse; #[derive(Parser, Debug)] struct TestCli { diff --git a/crates/weaver-interface/src/toml_edit.rs b/crates/weaver-interface/src/toml_edit.rs index 65cdb815..12aeef53 100644 --- a/crates/weaver-interface/src/toml_edit.rs +++ b/crates/weaver-interface/src/toml_edit.rs @@ -22,7 +22,7 @@ use std::path::{Path, PathBuf}; use std::time::{SystemTime, UNIX_EPOCH}; use serde::Serialize; -use weaver_inference::multi_model::{ModelEntry, MultiModelConfig}; +use weaver_spu::decoder::multi_model::{ModelEntry, MultiModelConfig}; /// Result of [`append_model_entry`]: the path written plus the block of TOML /// text that was appended (useful for CLI output). 
@@ -224,8 +224,8 @@ fn atomic_write(path: &Path, bytes: &[u8]) -> anyhow::Result<()> { mod tests { use super::*; use tempfile::TempDir; - use weaver_inference::family::{ModelMode, Pooling}; - use weaver_inference::multi_model::{BackendKind, GpuSpec}; + use weaver_spu::decoder::family::{ModelMode, Pooling}; + use weaver_spu::decoder::multi_model::{BackendKind, GpuSpec}; fn gguf_entry(name: &str, path: &str) -> ModelEntry { ModelEntry { diff --git a/crates/weaver-spu/src/decoder/server/prompt.rs b/crates/weaver-spu/src/decoder/server/prompt.rs index 137b9040..58b898ef 100644 --- a/crates/weaver-spu/src/decoder/server/prompt.rs +++ b/crates/weaver-spu/src/decoder/server/prompt.rs @@ -1,3 +1,10 @@ +// Module-scoped allow — pre-existing hidden-lifetime patterns in +// this file surface under weaver-spu's `#![warn(rust_2018_idioms)]` +// (downgraded from `deny` in PR-0.5.C when the file was folded in +// from weaver-inference). Fixing them is a future cleanup PR; out +// of scope for the no-op consolidation in PR-0.5.E. +#![allow(elided_lifetimes_in_paths)] + //! Prompt building and tool-call parsing — pure functions, no CUDA dependency. //! //! These functions convert OpenAI-format messages into model-specific prompt diff --git a/crates/weaver-spu/src/encoder/gguf_backend.rs b/crates/weaver-spu/src/encoder/gguf_backend.rs index 2999e078..4df01d96 100644 --- a/crates/weaver-spu/src/encoder/gguf_backend.rs +++ b/crates/weaver-spu/src/encoder/gguf_backend.rs @@ -234,17 +234,15 @@ fn l2_normalize(v: &[f32]) -> Vec { #[cfg(test)] mod tests { use super::*; - // Tests load a real GGUF via `weaver_inference`'s decoder - // model-loader because that's where `init_backend` / `load_model` - // / `GgufModelParams` live (per the post-Phase-0 ontology: - // model-runtime primitives stay in `weaver_inference`; - // embedding-pipeline composition lives here in - // `weaver_embedding`). `weaver-inference` is therefore a - // dev-dependency of `weaver-embedding`. 
A later PR (Phase 1 - // — the `EmbedderClient` impl) makes it a regular dependency - // when the production embedder code calls `load_model` itself. + // Tests load a real GGUF via the decoder-side model-loader + // primitives (`init_backend` / `load_model` / `GgufModelParams` + // in `weaver_spu::decoder::gguf`). Both halves of the SPU now + // live in this crate, so this is a sibling-module import — no + // cross-crate dev-dep wiring needed. The legacy comment about + // `weaver-inference` and `weaver-embedding` being separate + // crates is gone with PR-0.5.E. use std::sync::Mutex; - use weaver_inference::gguf::{GgufModelParams, init_backend, load_model}; + use weaver_spu::decoder::gguf::{GgufModelParams, init_backend, load_model}; const JINA_RETRIEVAL_Q6K: &str = "/opt/weaver/models/jina-embeddings-v4-text-retrieval-Q6_K.gguf";