diff --git a/Cargo.lock b/Cargo.lock index 7df7802d..a0a51631 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4142,23 +4142,8 @@ dependencies = [ "weaver-analysis", "weaver-core", "weaver-database", - "weaver-embedding", - "weaver-trace", -] - -[[package]] -name = "weaver-embedding" -version = "0.1.0" -dependencies = [ - "weaver-core", - "weaver-spu", -] - -[[package]] -name = "weaver-inference" -version = "0.1.0" -dependencies = [ "weaver-spu", + "weaver-trace", ] [[package]] @@ -4188,8 +4173,7 @@ dependencies = [ "weaver-core", "weaver-database", "weaver-demo", - "weaver-embedding", - "weaver-inference", + "weaver-spu", "weaver-trace", ] diff --git a/Cargo.toml b/Cargo.toml index c982a4b2..94d499e0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,12 +3,10 @@ resolver = "2" members = [ "crates/weaver-core", "crates/weaver-interface", - "crates/weaver-inference", "crates/weaver-database", "crates/weaver-demo", "crates/weaver-trace", "crates/weaver-analysis", - "crates/weaver-embedding", "crates/weaver-spu", ] @@ -23,8 +21,10 @@ license = "Apache-2.0" weaver-core = { path = "crates/weaver-core" } weaver-trace = { path = "crates/weaver-trace" } weaver-analysis = { path = "crates/weaver-analysis" } -weaver-embedding = { path = "crates/weaver-embedding" } -weaver-inference = { path = "crates/weaver-inference" } +# `weaver-embedding` and `weaver-inference` were deprecated re-export +# shells removed in PR-0.5.E. Their content folded into `weaver-spu` +# (encoder + decoder runtime) and `weaver-core` (Embedder trait). +# Workspace member count contracted from 9 → 7 per spec §6. # `default-features = false` at the workspace level so consumers can # explicitly opt out of weaver-spu's default feature set without # tripping cargo's "can't override workspace defaults" rule. 
diff --git a/crates/weaver-core/src/embedder.rs b/crates/weaver-core/src/embedder.rs index 5221c984..a7a768ef 100644 --- a/crates/weaver-core/src/embedder.rs +++ b/crates/weaver-core/src/embedder.rs @@ -7,7 +7,7 @@ //! lives: //! //! - **Pre-Phase-1 cutover**: gRPC client to the Python `weaver-embedder.service` -//! in `weaver-embedding::grpc_client::EmbeddingClient`. +//! in `weaver_spu::encoder::grpc_client_legacy::EmbeddingClient`. //! - **Post-Phase-1 cutover**: in-process candle backend //! `weaver-spu::encoder::client::EmbedderClient`. //! @@ -39,7 +39,7 @@ //! `Box` payload, so this crate doesn't need a //! tonic dep just to express gRPC failures. Backend impls map their //! concrete transport errors into the abstract `Transport` variant via -//! `From` impls (see `weaver-embedding::grpc_client`). +//! `From` impls (see `weaver_spu::encoder::grpc_client_legacy`). use async_trait::async_trait; @@ -107,7 +107,7 @@ pub struct LateChunkResult { /// /// Variants are deliberately abstract over transport so this crate /// doesn't pick up a tonic dep just to express gRPC errors. Backend -/// implementations (e.g., the gRPC client in `weaver-embedding`) map +/// implementations (e.g., the gRPC client in `weaver_spu::encoder::grpc_client_legacy`) map /// their concrete transport errors into [`Self::Transport`] via `From` /// impls and the message preserves the upstream error's `Display`. #[derive(Debug, thiserror::Error)] @@ -130,12 +130,12 @@ pub enum EmbeddingError { } // gRPC-transport `From` impls. These live here (not in -// `weaver-embedding`) because Rust's orphan rule requires foreign-on- +// `weaver_spu::encoder::grpc_client_legacy`) because Rust's orphan rule requires foreign-on- // foreign impls to be in the trait's crate. The cost is one tonic dep // in `weaver-core`; the alternative (explicit `.map_err` at every `?` // site in the gRPC client) is a larger maintenance burden than the // dep. 
Removed in PR-3.A along with the gRPC client itself, when the -// last consumer (weaver-embedding) goes away. +// last consumer (`weaver_spu::encoder::grpc_client_legacy`) goes away in PR-3.A. impl From for EmbeddingError { fn from(e: tonic::transport::Error) -> Self { EmbeddingError::Transport(Box::new(e)) @@ -151,7 +151,7 @@ impl From for EmbeddingError { /// Backend-agnostic embedding service. /// /// Implementations: -/// - `weaver_embedding::grpc_client::EmbeddingClient` — Python service over +/// - `weaver_spu::encoder::grpc_client_legacy::EmbeddingClient` — Python service over /// Unix socket / TCP gRPC (pre-Phase-1 cutover). /// - `weaver_spu::encoder::client::EmbedderClient` — Rust in-process /// candle backend (post-Phase-1 cutover). diff --git a/crates/weaver-database/src/chunking/mod.rs b/crates/weaver-database/src/chunking/mod.rs index 270cc782..a43e71e4 100644 --- a/crates/weaver-database/src/chunking/mod.rs +++ b/crates/weaver-database/src/chunking/mod.rs @@ -12,7 +12,7 @@ //! belongs alongside the rest of the embedding pipeline. Consumers //! that need `LateChunkConfig` / `LateChunkResult` / //! `late_chunk_embeddings` import from -//! `weaver_embedding::late_chunk::*` instead. +//! `weaver_spu::encoder::late_chunk::*` instead. mod strategies; diff --git a/crates/weaver-database/src/lib.rs b/crates/weaver-database/src/lib.rs index 9fc12c34..65debd5d 100644 --- a/crates/weaver-database/src/lib.rs +++ b/crates/weaver-database/src/lib.rs @@ -8,7 +8,7 @@ pub mod graph; // `weaver-embedding::grpc_client` per // `embedder-oxidization-Spec.md` §5.1 (issue #166 / sprint // Block A.5). External consumers import from -// `weaver_embedding::grpc_client` and `weaver_embedding::proto::embedding`. +// `weaver_spu::encoder::grpc_client_legacy` and `weaver_spu::proto::embedding`. 
// `weaver-database`'s `proto` module continues to generate // embedding-service types for backward compat with internal // proto-validation tests; a future cleanup PR removes the diff --git a/crates/weaver-demo/Cargo.toml b/crates/weaver-demo/Cargo.toml index 4783ef41..214e6da1 100644 --- a/crates/weaver-demo/Cargo.toml +++ b/crates/weaver-demo/Cargo.toml @@ -8,7 +8,10 @@ license.workspace = true [dependencies] weaver-core = { workspace = true } weaver-database = { workspace = true } -weaver-embedding = { workspace = true } +# `weaver-embedding` removed in PR-0.5.E. Demo uses the relocated +# encoder types via `weaver-spu` directly + the relocated `Embedder` +# trait via `weaver-core::embedder`. +weaver-spu = { workspace = true } weaver-trace = { workspace = true } weaver-analysis = { workspace = true } async-trait = { workspace = true } diff --git a/crates/weaver-demo/src/bin/embedder_latency.rs b/crates/weaver-demo/src/bin/embedder_latency.rs new file mode 100644 index 00000000..e1852f57 --- /dev/null +++ b/crates/weaver-demo/src/bin/embedder_latency.rs @@ -0,0 +1,381 @@ +// Pre-existing hidden-lifetime pattern at line 345 — not introduced +// by PR-0.5.E (the binary uses gRPC types whose generic lifetimes +// rustc began flagging more strictly). Out-of-scope cleanup for the +// no-op consolidation; allowed at file scope. +#![allow(elided_lifetimes_in_paths)] + +//! Embedder latency characterization for Option B (SuperEgo / ambient +//! surfacing) feasibility. +//! +//! The "latency is the enemy of agency" thesis says the harness's +//! internal-channel speed must beat the agent's external environment +//! by ≥1 OOM. The user's calibration: external = network = 1-10 ms; +//! the harness's transport floor target is the 100 ns range. We are +//! 4-5 OOM faster on transport-only IPC; what we measure here is +//! whether the embedder, which is part of the inner loop for +//! ambient surfacing, blows that budget. +//! +//! Two axes that matter: +//! +//! 
- **Input size sweep** — real `output.value` payloads from +//! benchmark traces are small (median 2 KB, max ~5 KB). We sweep +//! ~50 B → 32 KB to capture the realistic range plus the upper +//! tail. +//! - **Concurrency sweep** — Option B fires the embedder on every +//! tool output, and a twin-GPU bench has both agents firing into +//! the same embedder. Measuring 1 vs 2 vs 4 simultaneous requests +//! shows whether the current GPU2 (RTX 2000 75W) placement +//! collapses under contention. +//! +//! Output: a markdown table per (size × concurrency) cell with +//! P50 / P95 / P99 / max round-trip in nanoseconds, plus the +//! transport delta (round-trip minus server-reported compute) so +//! the IPC overhead is separable from the GPU cost. +//! +//! Per-call data is captured in Unix nanoseconds from +//! `SystemTime::UNIX_EPOCH` at request emit and response receive, +//! so future runs can be cross-correlated with packet captures or +//! CUDA-side instrumentation if/when we add per-stage profiling on +//! the Python embedder. + +use std::env; +use std::path::PathBuf; +use std::sync::Arc; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + +use tokio::task::JoinSet; + +use weaver_spu::encoder::grpc_client_legacy::{EmbeddingClient, EmbeddingClientConfig, EmbeddingEndpoint}; + +/// Single measured call. +struct Sample { + /// Wall-clock at send (Unix nanoseconds). Captured for future + /// cross-process correlation; not consumed by the aggregate + /// stats emitted in this run. + #[allow(dead_code)] + t_send_unix_ns: u128, + /// Wall-clock at receive (Unix nanoseconds). Same rationale as + /// `t_send_unix_ns`. + #[allow(dead_code)] + t_recv_unix_ns: u128, + /// Round-trip from `Instant::now()` deltas — monotonic, immune to + /// wall-clock jumps. This is the load-bearing measurement. + roundtrip_ns: u128, + /// Server-reported compute time. Service still uses ms; converted + /// to ns at the boundary so all subsequent math is in one unit. 
+ server_compute_ns: u128, + /// Round-trip minus server compute = IPC + serialization + + /// scheduler overhead. Negative values are clamped to 0 (the + /// server's `duration_ms` is integer-rounded so 0 ms server with + /// any positive client roundtrip still yields a sensible delta). + transport_overhead_ns: u128, +} + +/// Aggregated stats for a (size × concurrency) cell. +struct CellStats { + n: usize, + rt_p50_ns: u128, + rt_p95_ns: u128, + rt_p99_ns: u128, + rt_max_ns: u128, + rt_mean_ns: u128, + server_p50_ns: u128, + server_p95_ns: u128, + transport_p50_ns: u128, + transport_p95_ns: u128, +} + +impl CellStats { + fn from_samples(samples: &[Sample]) -> Self { + let mut rt: Vec = samples.iter().map(|s| s.roundtrip_ns).collect(); + let mut server: Vec = samples.iter().map(|s| s.server_compute_ns).collect(); + let mut transport: Vec = samples.iter().map(|s| s.transport_overhead_ns).collect(); + rt.sort_unstable(); + server.sort_unstable(); + transport.sort_unstable(); + + let pct = |sorted: &[u128], p: f64| -> u128 { + if sorted.is_empty() { + return 0; + } + // Nearest-rank percentile — adequate at n=100. We avoid + // interpolation so the reported number is an actual + // observed value, not a synthetic midpoint. + let idx = ((p * (sorted.len() as f64)).ceil() as usize).saturating_sub(1); + sorted[idx.min(sorted.len() - 1)] + }; + + let mean = if rt.is_empty() { + 0 + } else { + rt.iter().sum::() / (rt.len() as u128) + }; + + Self { + n: samples.len(), + rt_p50_ns: pct(&rt, 0.50), + rt_p95_ns: pct(&rt, 0.95), + rt_p99_ns: pct(&rt, 0.99), + rt_max_ns: *rt.last().unwrap_or(&0), + rt_mean_ns: mean, + server_p50_ns: pct(&server, 0.50), + server_p95_ns: pct(&server, 0.95), + transport_p50_ns: pct(&transport, 0.50), + transport_p95_ns: pct(&transport, 0.95), + } + } +} + +/// Format a nanosecond count with the most readable unit. 
+fn fmt_ns(ns: u128) -> String { + if ns < 1_000 { + format!("{ns} ns") + } else if ns < 1_000_000 { + format!("{:.2} µs", (ns as f64) / 1_000.0) + } else if ns < 1_000_000_000 { + format!("{:.2} ms", (ns as f64) / 1_000_000.0) + } else { + format!("{:.3} s", (ns as f64) / 1_000_000_000.0) + } +} + +async fn one_call(client: &EmbeddingClient, text: &str, task: &str) -> anyhow::Result { + let t_send_unix_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos(); + let t0 = Instant::now(); + + // `embed_one` calls `embed` with a single-element batch; the + // result carries the server-reported `duration_ms`. + let result = client.embed(&[text.to_string()], task, None).await?; + + let elapsed = t0.elapsed(); + let t_recv_unix_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos(); + + let roundtrip_ns = elapsed.as_nanos(); + let server_compute_ns = (result.duration_ms as u128) * 1_000_000; + let transport_overhead_ns = roundtrip_ns.saturating_sub(server_compute_ns); + + Ok(Sample { + t_send_unix_ns, + t_recv_unix_ns, + roundtrip_ns, + server_compute_ns, + transport_overhead_ns, + }) +} + +/// Run a (size × concurrency) cell. +async fn run_cell( + client: Arc, + text: Arc, + task: &str, + iters: usize, + concurrency: usize, +) -> anyhow::Result> { + let mut all_samples: Vec = Vec::with_capacity(iters); + let task_str = task.to_string(); + + if concurrency == 1 { + for _ in 0..iters { + let s = one_call(&client, &text, &task_str).await?; + all_samples.push(s); + } + } else { + // Split iters across `concurrency` parallel tasks. Each task + // does its own sequential calls — the `concurrency` parameter + // models "how many simultaneous in-flight requests is the + // embedder receiving," which is what GPU2 contention sees + // when both agents fire ambient surfacing at once. 
+ let per_worker = iters.div_ceil(concurrency); + let mut set = JoinSet::new(); + for _ in 0..concurrency { + let c = client.clone(); + let t = text.clone(); + let task_owned = task_str.clone(); + set.spawn(async move { + let mut local: Vec = Vec::with_capacity(per_worker); + for _ in 0..per_worker { + let s = one_call(&c, &t, &task_owned).await?; + local.push(s); + } + Ok::, anyhow::Error>(local) + }); + } + while let Some(res) = set.join_next().await { + let local = res??; + all_samples.extend(local); + } + // Trim any overshoot from the rounding in `per_worker`. + all_samples.truncate(iters); + } + + Ok(all_samples) +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + // Allow override of the default socket via env, useful for + // pointing at a moved embedder later (e.g. GPU0 co-resident + // experiment) without recompiling. + let socket = env::var("WEAVER_EMBEDDER_SOCKET") + .map(PathBuf::from) + .unwrap_or_else(|_| PathBuf::from("/run/weaver/embedder.sock")); + + println!("# Embedder latency characterization"); + println!(); + println!("**Date:** {}", chrono::Utc::now().to_rfc3339()); + println!("**Socket:** `{}`", socket.display()); + println!("**Task:** `retrieval.query`"); + println!(); + println!("Per-call timing uses `Instant::now()` for monotonic round-trip and"); + println!("`SystemTime` (Unix nanoseconds) for cross-process correlation."); + println!("Server-reported compute time is in milliseconds; transport overhead"); + println!("= round-trip − server compute."); + println!(); + + let cfg = EmbeddingClientConfig { + endpoint: EmbeddingEndpoint::Unix(socket), + timeout: Duration::from_secs(30), + connect_timeout: Duration::from_secs(5), + }; + + println!("## Connecting…"); + let client = EmbeddingClient::connect(cfg).await?; + let client = Arc::new(client); + println!("Connected."); + println!(); + + // Input size sweep. Sizes chosen from real trace data + // (median 2 KB, max ~5 KB) plus an upper-tail synthetic. 
+ let sizes: &[(usize, &str)] = &[ + (50, "small_50B"), + (500, "small_500B"), + (2_000, "median_2KB"), + (5_000, "max_observed_5KB"), + (16_000, "upper_tail_16KB"), + ]; + + let task = "retrieval.query"; + let warmup_iters = 5; + let measured_iters = 100; + let concurrencies = [1usize, 2, 4]; + + println!("## Warmup"); + println!(); + println!("{warmup_iters} calls at the median size to warm the GPU and bring weights into HBM."); + println!(); + let warm_text = Arc::new("a".repeat(2_000)); + for i in 0..warmup_iters { + let s = one_call(&client, &warm_text, task).await?; + println!( + "warmup {} → roundtrip {}, server {} ms", + i + 1, + fmt_ns(s.roundtrip_ns), + s.server_compute_ns / 1_000_000, + ); + } + println!(); + + // Run each (size × concurrency) cell. + println!("## Results"); + println!(); + println!( + "| Size | Concurrency | n | RT P50 | RT P95 | RT P99 | RT max | RT mean | Server P50 | Server P95 | Transport P50 | Transport P95 |" + ); + println!("|---|---|---|---|---|---|---|---|---|---|---|---|"); + + let mut all_cells: Vec<(String, usize, CellStats)> = Vec::new(); + + for (size_bytes, size_label) in sizes { + // Synthetic payload of the target size. We don't care about + // semantics — Jina V4's compute cost depends on tokenized + // length, and 'a' chars tokenize predictably without + // exercising any vocab edge cases. 
+ let text = Arc::new("a".repeat(*size_bytes)); + + for concurrency in concurrencies { + print!( + "{} (B={}) × c={} → running…", + size_label, size_bytes, concurrency + ); + let samples = run_cell( + client.clone(), + text.clone(), + task, + measured_iters, + concurrency, + ) + .await?; + let stats = CellStats::from_samples(&samples); + println!(" done."); + + println!( + "| {} (`{} B`) | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |", + size_label, + size_bytes, + concurrency, + stats.n, + fmt_ns(stats.rt_p50_ns), + fmt_ns(stats.rt_p95_ns), + fmt_ns(stats.rt_p99_ns), + fmt_ns(stats.rt_max_ns), + fmt_ns(stats.rt_mean_ns), + fmt_ns(stats.server_p50_ns), + fmt_ns(stats.server_p95_ns), + fmt_ns(stats.transport_p50_ns), + fmt_ns(stats.transport_p95_ns), + ); + + all_cells.push((format!("{size_label} ({size_bytes} B)"), concurrency, stats)); + } + } + + println!(); + println!("## OOM-test reading"); + println!(); + println!("**Reference points** (per `latency is the enemy of agency`):"); + println!("- 100 ns target = harness IPC floor (decisively faster than network)."); + println!("- 1-10 ms = industry-standard \"good\" network latency (the bar to beat by ≥1 OOM)."); + println!("- ≥1 ms = same-OOM-as-network territory; harness loses its claimed advantage."); + println!(); + let single_cells: Vec<&CellStats> = all_cells + .iter() + .filter(|(_, c, _)| *c == 1) + .map(|(_, _, s)| s) + .collect(); + if let Some(median_cell) = single_cells.get(2) { + let rt_p95 = median_cell.rt_p95_ns; + let in_ms = (rt_p95 as f64) / 1_000_000.0; + let oom_vs_network = if rt_p95 > 0 { + (10_000_000_f64 / rt_p95 as f64).log10() + } else { + f64::INFINITY + }; + println!( + "Median-size (~2 KB), c=1, P95 round-trip: **{}** ({:.2} ms).", + fmt_ns(rt_p95), + in_ms + ); + println!("vs 10 ms network: {:.2} OOM faster.", oom_vs_network); + println!( + "vs 100 ns transport floor: {:.2} OOM slower.", + ((rt_p95 as f64) / 100.0).log10() + ); + println!(); + println!( + "**Reading:** the 
embedder is in the millisecond range, comparable to network latency. \ + It is part of the harness but its compute cost puts it in the same OOM as the external \ + environment it is supposed to beat. Ambient surfacing in Option B would have to either \ + (a) fire rarely enough that millisecond cost amortizes, or (b) move the embedder into a \ + topology where the compute cost is hidden behind cheaper transport (e.g. co-resident on \ + the agent's GPU with intra-VRAM transfer)." + ); + } + + Ok(()) +} diff --git a/crates/weaver-demo/src/herobench/belief.rs b/crates/weaver-demo/src/herobench/belief.rs index d4b227e5..8578e9fb 100644 --- a/crates/weaver-demo/src/herobench/belief.rs +++ b/crates/weaver-demo/src/herobench/belief.rs @@ -1629,7 +1629,7 @@ pub enum HypothesisDedupError { /// consume attempt numbers. pub async fn dedup_and_upsert_attempt_hypothesis( pool: &ArangoPool, - embedder: &weaver_embedding::grpc_client::EmbeddingClient, + embedder: &weaver_spu::encoder::grpc_client_legacy::EmbeddingClient, task_name: &str, attempt_num: usize, hypothesis: &str, diff --git a/crates/weaver-demo/src/herobench/benchmark.rs b/crates/weaver-demo/src/herobench/benchmark.rs index 58c45fa7..185e1609 100644 --- a/crates/weaver-demo/src/herobench/benchmark.rs +++ b/crates/weaver-demo/src/herobench/benchmark.rs @@ -914,9 +914,9 @@ pub async fn run_benchmark(config: BenchmarkConfig) -> Result { // stays on the degraded path). This preserves benchmark viability on // machines without the embedder while still wiring the gate for any run // that has one up. 
- let embedder: Option> = + let embedder: Option> = if db_pool.is_some() { - match weaver_embedding::grpc_client::EmbeddingClient::connect_default().await + match weaver_spu::encoder::grpc_client_legacy::EmbeddingClient::connect_default().await { Ok(client) => match client.ensure_ready().await { Ok(_) => Some(Arc::new(client)), diff --git a/crates/weaver-demo/tests/herobench_integration.rs b/crates/weaver-demo/tests/herobench_integration.rs index b0eddf61..bb929bd9 100644 --- a/crates/weaver-demo/tests/herobench_integration.rs +++ b/crates/weaver-demo/tests/herobench_integration.rs @@ -2294,8 +2294,8 @@ async fn integration_graph_stats() { // --ignored integration_dedup // --------------------------------------------------------------------------- -async fn try_connect_embedder() -> Option { - let client = match weaver_embedding::grpc_client::EmbeddingClient::connect_default().await { +async fn try_connect_embedder() -> Option { + let client = match weaver_spu::encoder::grpc_client_legacy::EmbeddingClient::connect_default().await { Ok(c) => c, Err(e) => { eprintln!("skipping: embedder not reachable ({e})"); diff --git a/crates/weaver-demo/tests/similarity_calibration.rs b/crates/weaver-demo/tests/similarity_calibration.rs index 5f01838b..1ccb5632 100644 --- a/crates/weaver-demo/tests/similarity_calibration.rs +++ b/crates/weaver-demo/tests/similarity_calibration.rs @@ -24,7 +24,7 @@ use weaver_demo::herobench::similarity::{ Classification, DUPLICATE_PAIRS, DUPLICATE_THRESHOLD, MemoryCandidate, NEAR_DUPLICATE_THRESHOLD, NOVEL_PAIRS, classify, }; -use weaver_embedding::grpc_client::EmbeddingClient; +use weaver_spu::encoder::grpc_client_legacy::EmbeddingClient; /// Embed one hypothesis string into a candidate with no key — callers /// wrap the result with a key appropriate to the assertion being made. 
diff --git a/crates/weaver-embedding/Cargo.toml b/crates/weaver-embedding/Cargo.toml deleted file mode 100644 index 094b1dbf..00000000 --- a/crates/weaver-embedding/Cargo.toml +++ /dev/null @@ -1,24 +0,0 @@ -[package] -name = "weaver-embedding" -version.workspace = true -edition.workspace = true -rust-version.workspace = true -license.workspace = true -description = "Deprecated re-export shell — content folded into weaver-spu (encoder side) + weaver-core (Embedder trait) per PR-0.5.B + PR-0.5.D; removed in PR-0.5.E." - -[features] -# Forward feature flags to weaver-spu so consumers that opt into -# `gguf` (legacy GGUF encoder backend + Persephone gRPC client) via -# this crate still get the corresponding weaver-spu feature enabled. -# Removed in PR-0.5.E. -default = [] -gguf = ["weaver-spu/gguf"] - -[dependencies] -# Sole runtime deps: weaver-core (for the relocated `Embedder` trait -# + types) and weaver-spu (for the relocated late_chunk / pin / -# legacy gRPC + GGUF backends + proto module). -# `default-features = false` on weaver-spu to avoid feature leakage — -# this shell crate forwards features explicitly via the `gguf` flag. -weaver-core = { workspace = true } -weaver-spu = { workspace = true, default-features = false } diff --git a/crates/weaver-embedding/src/lib.rs b/crates/weaver-embedding/src/lib.rs deleted file mode 100644 index 9b80de17..00000000 --- a/crates/weaver-embedding/src/lib.rs +++ /dev/null @@ -1,104 +0,0 @@ -//! Deprecated re-export shell — `weaver-embedding`'s encoder-side -//! modules folded into `weaver-spu` per `docs/specs/weaver-spu-Spec.md` -//! PR-0.5.D. -//! -//! New code should import: -//! - `Embedder` trait + types: `weaver_core::embedder::*` -//! (relocated in PR-0.5.B). -//! - `late_chunk` + `LateChunkConfig` / `LateChunkResult` / -//! `late_chunk_embeddings`: `weaver_spu::encoder::*`. -//! - gRPC client (`EmbeddingClient`, `EmbeddingClientConfig`, -//! `EmbeddingEndpoint`): `weaver_spu::encoder::grpc_client_legacy` -//! 
(always available — production embedder backend during the -//! migration window; retires in PR-3.A). -//! - GGUF embedder backend: `weaver_spu::encoder::gguf_backend` -//! (gated by `gguf` feature; retires in PR-3.B). -//! - Cohort-pin guard: `weaver_spu::core::pin` (relocated in -//! PR-0.5.D). -//! - Persephone proto: `weaver_spu::proto::embedding::*` (always -//! available; retires in PR-3.A). -//! -//! This crate stays in the workspace through the migration window so -//! existing consumers compile unchanged. Removed in PR-0.5.E along -//! with `weaver-inference` and the rest of the legacy crates. - -// Module-scoped allow — this entire crate is a deprecated shell. -// Every re-export below points at a stable weaver-spu or weaver-core -// module. External consumers see deprecation warnings on their own -// `use` lines. -#![allow(deprecated)] -#![doc(html_root_url = "https://docs.rs/weaver-embedding/0.1.0")] - -// `Embedder` trait + types (relocated in PR-0.5.B). -#[deprecated( - since = "0.1.0", - note = "moved to weaver_core::embedder::Embedder; this re-export goes away in PR-0.5.E" -)] -pub use weaver_core::embedder::Embedder; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_core::embedder::EmbedderInfo; this alias goes away in PR-0.5.E" -)] -pub type EmbedderInfo = weaver_core::embedder::EmbedderInfo; - -// Late-chunking surface (relocated in PR-0.5.D). 
-#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::encoder::late_chunk::LateChunkConfig; this alias goes away in PR-0.5.E" -)] -pub type LateChunkConfig = weaver_spu::encoder::LateChunkConfig; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::encoder::late_chunk::LateChunkResult; this alias goes away in PR-0.5.E" -)] -pub type LateChunkResult = weaver_spu::encoder::LateChunkResult; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::encoder::late_chunk::late_chunk_embeddings; consumers should call the function via its new path" -)] -pub use weaver_spu::encoder::late_chunk_embeddings; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::encoder::late_chunk; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::encoder::late_chunk; - -// Cohort-pin guard (relocated in PR-0.5.D). -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::core::pin; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::core::pin; - -// gRPC client + proto (always-available; the gRPC client is the -// production embedder backend during the migration window). -// Retires in PR-3.A. -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::encoder::grpc_client_legacy; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::encoder::grpc_client_legacy as grpc_client; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::proto; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::proto; - -// GGUF backend (feature-gated; legacy in-process llama.cpp embedder -// path; retires in PR-3.B). 
-#[cfg(feature = "gguf")] -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::encoder::gguf_backend; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::encoder::gguf_backend; - -// Convenience top-level re-exports preserving the prior shape: -// `use weaver_embedding::{LateChunkConfig, LateChunkResult, -// late_chunk_embeddings};` continues to compile, but each name is -// individually deprecated above so consumers see warnings. diff --git a/crates/weaver-inference/Cargo.toml b/crates/weaver-inference/Cargo.toml deleted file mode 100644 index 320cb025..00000000 --- a/crates/weaver-inference/Cargo.toml +++ /dev/null @@ -1,32 +0,0 @@ -[package] -name = "weaver-inference" -description = "Deprecated re-export shell — content folded into weaver-spu per PR-0.5.C; removed in PR-0.5.E" -edition.workspace = true -rust-version.workspace = true -version.workspace = true -license.workspace = true - -[features] -# Forward feature flags to weaver-spu so consumers that opt into -# `cuda` or `gguf` via this crate still get the corresponding -# weaver-spu feature enabled. Removed in PR-0.5.E. -default = [] -cuda = ["weaver-spu/cuda"] -gguf = ["weaver-spu/gguf"] - -[dependencies] -# Sole dependency: the new home of everything this crate used to host. -# `default-features = false` prevents accidental feature leakage — -# `weaver-spu`'s default feature set is empty today but could grow in -# future PRs (e.g., enabling candle by default), and we don't want a -# transient consumer of this deprecated shell to silently activate -# heavier downstream features. This crate's `cuda` and `gguf` features -# explicitly forward to the corresponding `weaver-spu` features -# above, which is the only intended path. -weaver-spu = { workspace = true, default-features = false } - -# Note: the binary `weaver-infer` moved to weaver-spu in PR-0.5.C -# (now `weaver-spu/src/main.rs`). The `[[bin]]` section is gone here; -# operators run `cargo run -p weaver-spu --bin weaver-infer`. 
The -# legacy `cargo run -p weaver-inference --bin weaver-infer` no longer -# resolves; spec PR-0.5.C calls this out. diff --git a/crates/weaver-inference/examples/inference.toml b/crates/weaver-inference/examples/inference.toml deleted file mode 100644 index 6c5c0120..00000000 --- a/crates/weaver-inference/examples/inference.toml +++ /dev/null @@ -1,35 +0,0 @@ -# weaver-inference configuration -# -# Usage: -# weaver-infer --config examples/inference.toml -# -# All fields except model_id have defaults. -# CLI args override values from this file. - -# HuggingFace model ID -model_id = "Qwen/Qwen2.5-3B" - -# Tensor parallelism degree (1 = single GPU, 2 = dual GPU) -tp_size = 1 - -# Explicit GPU ordinals (0-indexed CUDA device IDs). -# Omit to auto-select. Length must match tp_size. -# -# Todd's workstation: -# 0 = NVIDIA RTX A6000 (48GB, sm_86) -# 1 = NVIDIA RTX A6000 (48GB, sm_86) -# 2 = NVIDIA RTX 2000 Ada (16GB, sm_89) -# -# Examples: -# gpu_ordinals = [2] # Small model on the Ada -# gpu_ordinals = [0, 1] # Large model across both A6000s (tp_size = 2) -gpu_ordinals = [2] - -# Maximum sequence length (prompt + generation) -max_seq_len = 4096 - -# Unix socket path (set to "" to disable) -socket_path = "/run/weaver/inference.sock" - -# TCP listener address (uncomment for remote access) -# listen_addr = "0.0.0.0:8080" diff --git a/crates/weaver-inference/kernels/transformer.cu b/crates/weaver-inference/kernels/transformer.cu deleted file mode 100644 index 9eba1359..00000000 --- a/crates/weaver-inference/kernels/transformer.cu +++ /dev/null @@ -1,1266 +0,0 @@ -// Transformer inference kernels for Qwen/Llama/Mistral/Gemma4 architectures. -// -// All kernels operate on f16 data (__half). Element-wise operations -// accumulate in f32 internally for numerical stability. -// -// Each __global__ kernel has a corresponding extern "C" host-side -// launcher function that the Rust FFI calls. The launcher computes -// grid/block dimensions and invokes the kernel. 
-// -// Compiled to SASS for sm_86 (A6000) and sm_89 (RTX Ada) via build.rs. - -#include -#include -#include - -// Block size for 1D kernels -#define BLOCK_SIZE 256 - -// ============================================================================ -// RMSNorm: x_norm = x * rsqrt(mean(x^2) + eps) * weight -// ============================================================================ - -static __global__ void rmsnorm_kernel( - const half* __restrict__ input, - const half* __restrict__ weight, - half* __restrict__ output, - int hidden_size, - float eps -) { - int row = blockIdx.x; - int tid = threadIdx.x; - int stride = blockDim.x; - - const half* x = input + (long long)row * hidden_size; - half* out = output + (long long)row * hidden_size; - - extern __shared__ float sdata[]; - - float local_sum = 0.0f; - for (int i = tid; i < hidden_size; i += stride) { - float val = __half2float(x[i]); - local_sum += val * val; - } - sdata[tid] = local_sum; - __syncthreads(); - - for (int s = stride / 2; s > 0; s >>= 1) { - if (tid < s) { - sdata[tid] += sdata[tid + s]; - } - __syncthreads(); - } - - float rms = rsqrtf(sdata[0] / (float)hidden_size + eps); - - for (int i = tid; i < hidden_size; i += stride) { - float val = __half2float(x[i]) * rms; - out[i] = __float2half(val * __half2float(weight[i])); - } -} - -extern "C" void launch_rmsnorm( - const void* input, - const void* weight, - void* output, - int batch_size, - int hidden_size, - float eps, - void* stream -) { - int threads = (hidden_size < BLOCK_SIZE) ? hidden_size : BLOCK_SIZE; - int smem = threads * sizeof(float); - rmsnorm_kernel<<>>( - (const half*)input, (const half*)weight, (half*)output, - hidden_size, eps - ); -} - -// ============================================================================ -// RoPE: Rotary Position Embedding (rotary-half convention) -// -// Qwen/Llama/Mistral use rotary-half: pairs are (d[i], d[i + half_dim]) -// NOT interleaved pairs (d[2i], d[2i+1]). 
-// -// For each pair: -// new_d[i] = d[i] * cos(angle) - d[i + half_dim] * sin(angle) -// new_d[i + half_dim] = d[i] * sin(angle) + d[i + half_dim] * cos(angle) -// ============================================================================ - -static __global__ void rope_kernel( - half* __restrict__ qk, // [seq_len, num_heads, head_dim] - int num_heads, - int head_dim, - int pos_offset, - float theta_base -) { - int seq_idx = blockIdx.x; - int head = blockIdx.y; - int tid = threadIdx.x; - int half_dim = head_dim / 2; - - if (tid >= half_dim || head >= num_heads) return; - - int pos = seq_idx + pos_offset; - float freq = 1.0f / powf(theta_base, (float)(2 * tid) / (float)head_dim); - float angle = (float)pos * freq; - float cos_val = cosf(angle); - float sin_val = sinf(angle); - - long long base_idx = ((long long)seq_idx * num_heads + head) * head_dim; - float v0 = __half2float(qk[base_idx + tid]); - float v1 = __half2float(qk[base_idx + tid + half_dim]); - qk[base_idx + tid] = __float2half(v0 * cos_val - v1 * sin_val); - qk[base_idx + tid + half_dim] = __float2half(v0 * sin_val + v1 * cos_val); -} - -extern "C" void launch_rope( - void* q, - void* k, - int seq_len, - int num_heads, - int num_kv_heads, - int head_dim, - int pos_offset, - float theta_base, - void* stream -) { - int threads = head_dim / 2; - if (threads > BLOCK_SIZE) threads = BLOCK_SIZE; - dim3 grid_q(seq_len, num_heads); - dim3 grid_k(seq_len, num_kv_heads); - - rope_kernel<<>>( - (half*)q, num_heads, head_dim, pos_offset, theta_base - ); - rope_kernel<<>>( - (half*)k, num_kv_heads, head_dim, pos_offset, theta_base - ); -} - -// ============================================================================ -// SwiGLU: output = silu(gate) * up, where silu(x) = x * sigmoid(x) -// ============================================================================ - -static __global__ void swiglu_kernel( - const half* __restrict__ gate, - const half* __restrict__ up, - half* __restrict__ output, - int 
total_elements -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; - - float g = __half2float(gate[idx]); - float u = __half2float(up[idx]); - float silu_g = g / (1.0f + expf(-g)); - output[idx] = __float2half(silu_g * u); -} - -extern "C" void launch_swiglu( - const void* gate, - const void* up, - void* output, - int total_elements, - void* stream -) { - int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - swiglu_kernel<<>>( - (const half*)gate, (const half*)up, (half*)output, total_elements - ); -} - -// ============================================================================ -// Residual add: output += residual (in-place) -// ============================================================================ - -static __global__ void residual_add_kernel( - half* __restrict__ output, - const half* __restrict__ residual, - int total_elements -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; - float val = __half2float(output[idx]) + __half2float(residual[idx]); - output[idx] = __float2half(val); -} - -extern "C" void launch_residual_add( - void* output, - const void* residual, - int total_elements, - void* stream -) { - int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - residual_add_kernel<<>>( - (half*)output, (const half*)residual, total_elements - ); -} - -// ============================================================================ -// Bias add: output[pos * dim + d] += bias[d] for all positions -// ============================================================================ - -static __global__ void bias_add_kernel( - half* __restrict__ output, - const half* __restrict__ bias, - int dim, - int seq_len -) { - int pos = blockIdx.x; - int tid = threadIdx.x; - int stride = blockDim.x; - - if (pos >= seq_len) return; - - half* out = output + (long long)pos * dim; - for (int d = tid; d < dim; d += stride) { - float val = __half2float(out[d]) + 
__half2float(bias[d]); - out[d] = __float2half(val); - } -} - -extern "C" void launch_bias_add( - void* output, - const void* bias, - int dim, - int seq_len, - void* stream -) { - int threads = (dim < BLOCK_SIZE) ? dim : BLOCK_SIZE; - bias_add_kernel<<>>( - (half*)output, (const half*)bias, dim, seq_len - ); -} - -// ============================================================================ -// Embedding lookup: output[i] = embed_table[token_ids[i]] -// ============================================================================ - -static __global__ void embedding_kernel( - const half* __restrict__ embed_table, - const int* __restrict__ token_ids, - half* __restrict__ output, - int hidden_size, - int seq_len -) { - int pos = blockIdx.x; - int tid = threadIdx.x; - int stride = blockDim.x; - - if (pos >= seq_len) return; - - int token_id = token_ids[pos]; - const half* row = embed_table + (long long)token_id * hidden_size; - half* out = output + (long long)pos * hidden_size; - - for (int i = tid; i < hidden_size; i += stride) { - out[i] = row[i]; - } -} - -extern "C" void launch_embedding( - const void* embed_table, - const int* token_ids, - void* output, - int hidden_size, - int seq_len, - void* stream -) { - int threads = (hidden_size < BLOCK_SIZE) ? hidden_size : BLOCK_SIZE; - embedding_kernel<<>>( - (const half*)embed_table, token_ids, (half*)output, - hidden_size, seq_len - ); -} - -// ============================================================================ -// Causal attention (single-head, strided access for GQA) -// -// Q, K, V, output may be strided — heads are interleaved within each -// position in the multi-head tensor. The stride parameters tell the -// kernel how many elements to skip between consecutive positions. -// -// For prefill (seq_len > 1): full QK^T + causal mask + softmax + V -// For decode (seq_len == 1): dot product attention against KV cache -// -// This is a simple implementation. For production, use FlashAttention. 
-// ============================================================================ - -static __global__ void attention_scores_kernel( - const half* __restrict__ q, // [seq_len, ...] one head, strided - const half* __restrict__ k, // [total_len, ...] one KV head, strided - float* __restrict__ scores, // [seq_len, total_len] - int seq_len, - int total_len, - int head_dim, - int q_stride, // elements between positions in Q - int kv_stride, // elements between positions in K - float scale -) { - int q_pos = blockIdx.x; - int tid = threadIdx.x; - int stride = blockDim.x; - - if (q_pos >= seq_len) return; - - const half* q_row = q + (long long)q_pos * q_stride; - - for (int k_pos = tid; k_pos < total_len; k_pos += stride) { - // Causal mask: only attend to positions <= q_pos + (total_len - seq_len) - int q_abs = q_pos + (total_len - seq_len); - if (k_pos > q_abs) { - scores[q_pos * total_len + k_pos] = -1e9f; - continue; - } - - const half* k_row = k + (long long)k_pos * kv_stride; - float dot = 0.0f; - for (int d = 0; d < head_dim; d++) { - dot += __half2float(q_row[d]) * __half2float(k_row[d]); - } - scores[q_pos * total_len + k_pos] = dot * scale; - } -} - -static __global__ void softmax_kernel( - float* __restrict__ data, - int cols -) { - int row = blockIdx.x; - int tid = threadIdx.x; - int stride = blockDim.x; - float* row_data = data + (long long)row * cols; - - extern __shared__ float sdata[]; - - // Find max - float local_max = -1e30f; - for (int i = tid; i < cols; i += stride) { - if (row_data[i] > local_max) local_max = row_data[i]; - } - sdata[tid] = local_max; - __syncthreads(); - - for (int s = stride / 2; s > 0; s >>= 1) { - if (tid < s) sdata[tid] = fmaxf(sdata[tid], sdata[tid + s]); - __syncthreads(); - } - float max_val = sdata[0]; - - // Sum of exp - float local_sum = 0.0f; - for (int i = tid; i < cols; i += stride) { - row_data[i] = expf(row_data[i] - max_val); - local_sum += row_data[i]; - } - sdata[tid] = local_sum; - __syncthreads(); - - for (int s = 
stride / 2; s > 0; s >>= 1) { - if (tid < s) sdata[tid] += sdata[tid + s]; - __syncthreads(); - } - - float inv_sum = 1.0f / sdata[0]; - for (int i = tid; i < cols; i += stride) { - row_data[i] *= inv_sum; - } -} - -static __global__ void attention_output_kernel( - const float* __restrict__ scores, // [seq_len, total_len] - const half* __restrict__ v, // [total_len, ...] one KV head, strided - half* __restrict__ output, // [seq_len, ...] one head, strided - int seq_len, - int total_len, - int head_dim, - int kv_stride, // elements between positions in V - int out_stride // elements between positions in output -) { - int q_pos = blockIdx.x; - int d = threadIdx.x; - - if (q_pos >= seq_len || d >= head_dim) return; - - const float* score_row = scores + (long long)q_pos * total_len; - - float acc = 0.0f; - for (int k_pos = 0; k_pos < total_len; k_pos++) { - acc += score_row[k_pos] * __half2float(v[(long long)k_pos * kv_stride + d]); - } - output[(long long)q_pos * out_stride + d] = __float2half(acc); -} - -extern "C" void launch_attention( - const void* q, // [seq_len, ...] one head (strided) - const void* k, // [total_len, ...] one KV head (strided) - const void* v, // [total_len, ...] one KV head (strided) - void* output, // [seq_len, ...] one head (strided) - void* scores_buf, // [seq_len, total_len] f32 scratch - int seq_len, - int total_len, - int head_dim, - int q_stride, // stride between positions in Q/output - int kv_stride, // stride between positions in K/V - int out_stride, // stride between positions in output - float scale, - void* stream -) { - cudaStream_t s = (cudaStream_t)stream; - - // Step 1: QK^T scores with causal mask - int threads1 = (total_len < BLOCK_SIZE) ? total_len : BLOCK_SIZE; - attention_scores_kernel<<>>( - (const half*)q, (const half*)k, (float*)scores_buf, - seq_len, total_len, head_dim, q_stride, kv_stride, scale - ); - - // Step 2: Softmax - int threads2 = (total_len < BLOCK_SIZE) ? 
total_len : BLOCK_SIZE; - int t2 = 1; - while (t2 < threads2) t2 <<= 1; - if (t2 > BLOCK_SIZE) t2 = BLOCK_SIZE; - int smem = t2 * sizeof(float); - softmax_kernel<<>>( - (float*)scores_buf, total_len - ); - - // Step 3: Score * V - int threads3 = (head_dim < BLOCK_SIZE) ? head_dim : BLOCK_SIZE; - attention_output_kernel<<>>( - (const float*)scores_buf, (const half*)v, (half*)output, - seq_len, total_len, head_dim, kv_stride, out_stride - ); -} - -// ============================================================================ -// Split fused projection output into separate contiguous buffers. -// -// After a fused QKV or Gate+Up GEMM, the output is [seq_len, total_dim] -// with sub-tensors interleaved per row. This kernel deinterleaves into -// separate contiguous output buffers. -// ============================================================================ - -static __global__ void split_fused_kernel( - const half* __restrict__ fused, // [seq_len, total_dim] - half* __restrict__ out_a, // [seq_len, dim_a] - half* __restrict__ out_b, // [seq_len, dim_b] - half* __restrict__ out_c, // [seq_len, dim_c] or NULL for 2-way split - int seq_len, - int total_dim, - int dim_a, - int dim_b -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int total = seq_len * total_dim; - if (idx >= total) return; - - int pos = idx / total_dim; - int d = idx % total_dim; - half val = fused[idx]; - - if (d < dim_a) { - out_a[pos * dim_a + d] = val; - } else if (d < dim_a + dim_b) { - out_b[pos * dim_b + (d - dim_a)] = val; - } else if (out_c) { - int dim_c = total_dim - dim_a - dim_b; - out_c[pos * dim_c + (d - dim_a - dim_b)] = val; - } -} - -extern "C" void launch_split_fused( - const void* fused, - void* out_a, - void* out_b, - void* out_c, // NULL for 2-way split - int seq_len, - int total_dim, - int dim_a, - int dim_b, - void* stream -) { - int total = seq_len * total_dim; - int blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; - split_fused_kernel<<>>( - (const half*)fused, - 
(half*)out_a, (half*)out_b, (half*)out_c, - seq_len, total_dim, dim_a, dim_b - ); -} - -// ============================================================================ -// FlashAttention 2 — fused tiled attention with online softmax -// -// Replaces the 3-kernel naive pipeline (scores → softmax → output) with a -// single kernel that tiles Q into Br-row blocks, iterates K/V in Bc-row -// blocks, and uses online softmax (running max + sum) so the full -// [seq_len × total_len] attention matrix is never materialized. -// -// Handles GQA natively: multiple Q heads share the same KV head. -// -// Grid: (ceil(seq_len / BR), num_heads) -// Block: 128 threads -// Smem: Q tile (16 KB) + KV tile (16 KB) + score tile (16 KB) = 48 KB -// ============================================================================ - -#define FA2_BR 64 -#define FA2_BC 64 -#define FA2_THREADS 128 -#define FA2_MAX_HALF_DIM 128 // head_dim / 2, supports up to head_dim=256 - -static __global__ void __launch_bounds__(FA2_THREADS, 4) -flash_attention_2_kernel( - const half* __restrict__ Q, // [seq_len, num_heads * head_dim] - const half* __restrict__ K, // [total_len, num_kv_heads * head_dim] - const half* __restrict__ V, // [total_len, num_kv_heads * head_dim] - half* __restrict__ O, // [seq_len, num_heads * head_dim] - int seq_len, - int total_len, - int num_heads, - int num_kv_heads, - int head_dim, - float scale, - int causal -) { - int q_tile_idx = blockIdx.x; - int head = blockIdx.y; - int tid = threadIdx.x; - - int br_start = q_tile_idx * FA2_BR; - int gqa_groups = num_heads / num_kv_heads; - int kv_head = head / gqa_groups; - int pos_offset = total_len - seq_len; - - // Thread-to-output mapping: 2 threads per Q row, each handles half of head_dim - int my_row = tid % FA2_BR; // 0..63 - int dim_half = tid / FA2_BR; // 0 or 1 - int half_dim = head_dim / 2; // 64 for hd=128 - int dim_start = dim_half * half_dim; - - // Global memory strides - int q_stride = num_heads * head_dim; - int kv_stride = 
num_kv_heads * head_dim; - - // Shared memory: Q tile | KV tile | Score tile - extern __shared__ char smem_raw[]; - half* smem_Q = (half*)smem_raw; - half* smem_KV = smem_Q + FA2_BR * head_dim; - float* smem_S = (float*)(smem_KV + FA2_BC * head_dim); - - // Online softmax state (per-thread, per-row) - float m_val = -1e30f; - float l_val = 0.0f; - - // Output accumulator — half_dim registers (64 for hd=128) - float O_acc[FA2_MAX_HALF_DIM]; - for (int d = 0; d < half_dim; d++) { - O_acc[d] = 0.0f; - } - - // ---- Load Q tile (stays resident for all KV iterations) ---- - { - int total_elems = FA2_BR * head_dim; - for (int idx = tid; idx < total_elems; idx += FA2_THREADS) { - int row = idx / head_dim; - int d = idx % head_dim; - int global_row = br_start + row; - if (global_row < seq_len) { - smem_Q[idx] = Q[(long long)global_row * q_stride + (long long)head * head_dim + d]; - } else { - smem_Q[idx] = __float2half(0.0f); - } - } - } - __syncthreads(); - - // ---- KV block loop ---- - int num_kv_blocks = (total_len + FA2_BC - 1) / FA2_BC; - - for (int kv_blk = 0; kv_blk < num_kv_blocks; kv_blk++) { - int kv_start = kv_blk * FA2_BC; - - // Causal early exit: all K positions in this block are beyond all Q positions - if (causal) { - int max_q_abs = br_start + FA2_BR - 1 + pos_offset; - if (kv_start > max_q_abs) break; - } - - // ---- Load K block ---- - { - int total_elems = FA2_BC * head_dim; - for (int idx = tid; idx < total_elems; idx += FA2_THREADS) { - int row = idx / head_dim; - int d = idx % head_dim; - int global_row = kv_start + row; - if (global_row < total_len) { - smem_KV[idx] = K[(long long)global_row * kv_stride + (long long)kv_head * head_dim + d]; - } else { - smem_KV[idx] = __float2half(0.0f); - } - } - } - __syncthreads(); - - // ---- S = Q @ K^T * scale (with causal mask) → smem_S ---- - { - int total_s = FA2_BR * FA2_BC; - for (int idx = tid; idx < total_s; idx += FA2_THREADS) { - int i = idx / FA2_BC; - int j = idx % FA2_BC; - - float dot = 0.0f; - 
for (int d = 0; d < head_dim; d++) { - dot += __half2float(smem_Q[i * head_dim + d]) - * __half2float(smem_KV[j * head_dim + d]); - } - dot *= scale; - - // Causal mask: Q at absolute position q_abs can only attend to k_abs <= q_abs - if (causal) { - int q_abs = br_start + i + pos_offset; - int k_abs = kv_start + j; - if (k_abs > q_abs) dot = -1e30f; - } - // Mask out-of-bounds rows - if ((br_start + i) >= seq_len || (kv_start + j) >= total_len) { - dot = -1e30f; - } - - smem_S[idx] = dot; - } - } - __syncthreads(); - - // ---- Online softmax update for my_row ---- - // Both threads sharing this row (dim_half=0,1) compute identical m/l values - float block_max = -1e30f; - for (int j = 0; j < FA2_BC; j++) { - float s = smem_S[my_row * FA2_BC + j]; - if (s > block_max) block_max = s; - } - - float m_new = fmaxf(m_val, block_max); - float alpha = expf(m_val - m_new); - - float block_sum = 0.0f; - for (int j = 0; j < FA2_BC; j++) { - block_sum += expf(smem_S[my_row * FA2_BC + j] - m_new); - } - - l_val = l_val * alpha + block_sum; - for (int d = 0; d < half_dim; d++) { - O_acc[d] *= alpha; - } - m_val = m_new; - - __syncthreads(); - - // ---- Load V block (reuse smem_KV, K no longer needed) ---- - { - int total_elems = FA2_BC * head_dim; - for (int idx = tid; idx < total_elems; idx += FA2_THREADS) { - int row = idx / head_dim; - int d = idx % head_dim; - int global_row = kv_start + row; - if (global_row < total_len) { - smem_KV[idx] = V[(long long)global_row * kv_stride + (long long)kv_head * head_dim + d]; - } else { - smem_KV[idx] = __float2half(0.0f); - } - } - } - __syncthreads(); - - // ---- O_acc += P @ V (P recomputed from S and m_val) ---- - for (int j = 0; j < FA2_BC; j++) { - float p_val = expf(smem_S[my_row * FA2_BC + j] - m_val); - if (p_val > 1e-10f) { - for (int d = 0; d < half_dim; d++) { - O_acc[d] += p_val * __half2float(smem_KV[j * head_dim + dim_start + d]); - } - } - } - - __syncthreads(); - } - - // ---- Normalize and write output ---- - int 
global_row = br_start + my_row; - if (global_row < seq_len) { - float inv_l = (l_val > 0.0f) ? (1.0f / l_val) : 0.0f; - long long out_base = (long long)global_row * q_stride + (long long)head * head_dim; - for (int d = 0; d < half_dim; d++) { - O[out_base + dim_start + d] = __float2half(O_acc[d] * inv_l); - } - } -} - -extern "C" void launch_flash_attention( - const void* Q, - const void* K, - const void* V, - void* O, - int seq_len, - int total_len, - int num_heads, - int num_kv_heads, - int head_dim, - float scale, - int causal, - void* stream -) { - int q_tiles = (seq_len + FA2_BR - 1) / FA2_BR; - dim3 grid(q_tiles, num_heads); - dim3 block(FA2_THREADS); - - int smem_bytes = FA2_BR * head_dim * sizeof(half) // smem_Q - + FA2_BC * head_dim * sizeof(half) // smem_KV - + FA2_BR * FA2_BC * sizeof(float); // smem_S - - flash_attention_2_kernel<<>>( - (const half*)Q, (const half*)K, (const half*)V, (half*)O, - seq_len, total_len, num_heads, num_kv_heads, head_dim, - scale, causal - ); -} - -// ============================================================================ -// Fused decode attention (seq_len=1) -// -// Optimized for next-token generation. Each warp independently processes a -// stream of K/V positions — NO __syncthreads in the hot loop. -// -// Thread layout (dynamic, adapts to head_dim): -// Block size = head_dim threads (must be a multiple of 32) -// nwarps = head_dim / 32, dpt = head_dim / 32 (dims per thread) -// Each thread handles dpt dimensions: dims [lane*dpt .. lane*dpt+dpt-1] -// Warp w processes positions w, w+nwarps, w+2*nwarps, ... (interleaved) -// -// Examples: head_dim=128 → 4 warps, 4 dpt. head_dim=160 → 5 warps, 5 dpt. -// -// Each warp computes its own partial online-softmax result. After the loop, -// warps combine via shared memory using the log-sum-exp trick (1 sync). -// -// Memory: consecutive threads read consecutive K/V elements → coalesced. -// Supports head_dim up to 256 (DA_MAX_DPT=8). 
-// ============================================================================ - -// Max dims per thread — supports head_dim up to 512 (512/32 = 16). -// Gemma 4 global attention layers use head_dim=512. -#define DA_MAX_DPT 16 - -static __global__ void decode_fused_attention_kernel( - const half* __restrict__ Q, // [num_heads, head_dim] - const half* __restrict__ K, // [total_len, num_kv_heads, head_dim] - const half* __restrict__ V, // [total_len, num_kv_heads, head_dim] - half* __restrict__ O, // [num_heads, head_dim] - int num_heads, - int num_kv_heads, - int head_dim, - int total_len, - float scale -) { - const int head = blockIdx.x; - const int tid = threadIdx.x; - const int warp_id = tid / 32; - const int lane = tid & 31; - - // Compute layout from head_dim (block size = head_dim) - const int dpt = head_dim / 32; // dims per thread - const int nwarps = head_dim / 32; // warps per block - - const int gqa_groups = num_heads / num_kv_heads; - const int kv_head = head / gqa_groups; - const int kv_stride = num_kv_heads * head_dim; - - // Each thread loads its dpt Q dimensions - float q[DA_MAX_DPT]; - for (int i = 0; i < dpt; i++) { - q[i] = __half2float(Q[head * head_dim + lane * dpt + i]); - } - - // Per-warp online softmax state (registers) - float m = -1e30f; - float l = 0.0f; - float acc[DA_MAX_DPT]; - for (int i = 0; i < dpt; i++) acc[i] = 0.0f; - - // Hot loop: each warp processes its own position stream - for (int pos = warp_id; pos < total_len; pos += nwarps) { - long long base = (long long)pos * kv_stride + kv_head * head_dim - + lane * dpt; - - // Dot product: each thread multiplies its dpt dims, warp reduces - float dot = 0.0f; - for (int i = 0; i < dpt; i++) { - dot += q[i] * __half2float(K[base + i]); - } - // Warp-level reduction (shuffle only — no shared mem, no sync!) 
- #pragma unroll - for (int offset = 16; offset > 0; offset >>= 1) { - dot += __shfl_xor_sync(0xFFFFFFFF, dot, offset); - } - float score = dot * scale; // all lanes have the score - - // Online softmax + V accumulation - float new_m = fmaxf(m, score); - float correction = expf(m - new_m); - float p = expf(score - new_m); - - for (int i = 0; i < dpt; i++) { - acc[i] = acc[i] * correction + p * __half2float(V[base + i]); - } - l = l * correction + p; - m = new_m; - } - - // === Combine warps' partial results (log-sum-exp trick) === - // Dynamic shared memory layout: [nwarps floats m][nwarps floats l][nwarps*head_dim floats acc] - extern __shared__ char da_smem_raw[]; - float* smem_m = (float*)da_smem_raw; - float* smem_l = smem_m + nwarps; - float* smem_acc = smem_l + nwarps; - - if (lane == 0) { - smem_m[warp_id] = m; - smem_l[warp_id] = l; - } - for (int i = 0; i < dpt; i++) { - smem_acc[warp_id * head_dim + lane * dpt + i] = acc[i]; - } - __syncthreads(); - - // Find global max across warps (all threads compute, it's cheap) - float global_m = smem_m[0]; - for (int w = 1; w < nwarps; w++) { - global_m = fmaxf(global_m, smem_m[w]); - } - - // Combine each warp's contribution for this thread's dimensions - float final_l = 0.0f; - float final_acc[DA_MAX_DPT]; - for (int i = 0; i < dpt; i++) final_acc[i] = 0.0f; - - for (int w = 0; w < nwarps; w++) { - float corr = expf(smem_m[w] - global_m); - final_l += smem_l[w] * corr; - for (int i = 0; i < dpt; i++) { - final_acc[i] += smem_acc[w * head_dim + lane * dpt + i] * corr; - } - } - - // Write output (coalesced: consecutive threads write consecutive dims) - float inv_l = 1.0f / final_l; - for (int i = 0; i < dpt; i++) { - O[head * head_dim + lane * dpt + i] = - __float2half(final_acc[i] * inv_l); - } -} - -extern "C" void launch_decode_attention( - const void* Q, - const void* K, - const void* V, - void* O, - int num_heads, - int num_kv_heads, - int head_dim, - int total_len, - float scale, - void* stream -) { - 
cudaStream_t s = (cudaStream_t)stream; - int nwarps = head_dim / 32; - // Shared: nwarps floats (m) + nwarps floats (l) + nwarps*head_dim floats (acc) - int smem_bytes = (nwarps * 2 + nwarps * head_dim) * sizeof(float); - - decode_fused_attention_kernel<<>>( - (const half*)Q, (const half*)K, (const half*)V, (half*)O, - num_heads, num_kv_heads, head_dim, total_len, scale - ); -} - -// ============================================================================ -// P2P allreduce sum (2-GPU NVLink) -// ============================================================================ - -static __global__ void allreduce_kernel( - half* __restrict__ local_buf, - const half* __restrict__ peer_buf, - int size -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= size) return; - float val = __half2float(local_buf[idx]) + __half2float(peer_buf[idx]); - local_buf[idx] = __float2half(val); -} - -extern "C" void launch_allreduce_2gpu( - void* local_buf, - const void* peer_buf, - int num_elements, - void* stream -) { - int blocks = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - allreduce_kernel<<>>( - (half*)local_buf, (const half*)peer_buf, num_elements - ); -} - -// ============================================================================ -// Zero buffer: output[i] = 0 for all elements -// Used to clear MoE accumulator before expert dispatch. 
-// ============================================================================ - -static __global__ void zero_kernel(half* __restrict__ output, int total_elements) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; - output[idx] = __float2half(0.0f); -} - -extern "C" void launch_zero( - void* output, - int total_elements, - void* stream -) { - int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - zero_kernel<<>>( - (half*)output, total_elements - ); -} - -// ============================================================================ -// Fused scale-add: output[i] += scale * input[i] -// Used to accumulate weighted expert outputs in MoE dispatch. -// Computes in f32 internally for numerical stability. -// ============================================================================ - -static __global__ void scale_add_kernel( - half* __restrict__ output, - const half* __restrict__ input, - float scale, - int total_elements -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; - float val = __half2float(output[idx]) + scale * __half2float(input[idx]); - output[idx] = __float2half(val); -} - -extern "C" void launch_scale_add( - void* output, - const void* input, - float scale, - int total_elements, - void* stream -) { - int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - scale_add_kernel<<>>( - (half*)output, (const half*)input, scale, total_elements - ); -} - -// ============================================================================ -// Gather rows: dst[i, :] = src[indices[i], :] for i in 0..num_rows -// Used to batch tokens assigned to the same expert into a contiguous buffer -// for efficient GEMM during MoE prefill. 
-// ============================================================================ - -static __global__ void gather_rows_kernel( - const half* __restrict__ src, - half* __restrict__ dst, - const int* __restrict__ indices, - int num_rows, - int row_dim -) { - int row = blockIdx.x; - if (row >= num_rows) return; - int src_row = indices[row]; - int tid = threadIdx.x; - for (int d = tid; d < row_dim; d += blockDim.x) { - dst[(long long)row * row_dim + d] = src[(long long)src_row * row_dim + d]; - } -} - -extern "C" void launch_gather_rows( - const void* src, - void* dst, - const int* indices, - int num_rows, - int row_dim, - void* stream -) { - if (num_rows == 0) return; - gather_rows_kernel<<>>( - (const half*)src, (half*)dst, indices, num_rows, row_dim - ); -} - -// ============================================================================ -// Weighted scatter-add: dst[indices[i], :] += weights[i] * src[i, :] -// Used to accumulate weighted expert outputs back to their token positions -// in mlp_out during MoE prefill. Each expert's scatter runs sequentially -// on the same stream, so no write races between experts. 
-// ============================================================================ - -static __global__ void weighted_scatter_add_kernel( - half* __restrict__ dst, - const half* __restrict__ src, - const int* __restrict__ indices, - const float* __restrict__ weights, - int num_rows, - int row_dim -) { - int row = blockIdx.x; - if (row >= num_rows) return; - int dst_row = indices[row]; - float w = weights[row]; - int tid = threadIdx.x; - for (int d = tid; d < row_dim; d += blockDim.x) { - float val = __half2float(dst[(long long)dst_row * row_dim + d]) - + w * __half2float(src[(long long)row * row_dim + d]); - dst[(long long)dst_row * row_dim + d] = __float2half(val); - } -} - -extern "C" void launch_weighted_scatter_add( - void* dst, - const void* src, - const int* indices, - const float* weights, - int num_rows, - int row_dim, - void* stream -) { - if (num_rows == 0) return; - weighted_scatter_add_kernel<<>>( - (half*)dst, (const half*)src, indices, weights, num_rows, row_dim - ); -} - -// ============================================================================ -// GeGLU: output = gelu_tanh(gate) * up (Gemma 4 activation) -// -// GELU with tanh approximation: -// gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) -// Combined with gated unit: output = gelu(gate) * up -// ============================================================================ - -static __global__ void geglu_kernel( - const half* __restrict__ gate, - const half* __restrict__ up, - half* __restrict__ output, - int total_elements -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; - - float g = __half2float(gate[idx]); - float u = __half2float(up[idx]); - - // GELU tanh approximation: 0.5 * g * (1 + tanh(sqrt(2/pi) * (g + 0.044715 * g^3))) - const float SQRT_2_OVER_PI = 0.7978845608f; // sqrt(2.0 / pi) - float inner = SQRT_2_OVER_PI * (g + 0.044715f * g * g * g); - float gelu_g = 0.5f * g * (1.0f + tanhf(inner)); - - output[idx] = 
__float2half(gelu_g * u); -} - -extern "C" void launch_geglu( - const void* gate, - const void* up, - void* output, - int total_elements, - void* stream -) { - int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - geglu_kernel<<>>( - (const half*)gate, (const half*)up, (half*)output, total_elements - ); -} - -// ============================================================================ -// In-place scale: data[i] *= scale -// -// Used for: -// - Embedding scaling (sqrt(hidden_size)) after lookup -// - Per-layer scalar multiplication -// ============================================================================ - -static __global__ void scale_inplace_kernel( - half* __restrict__ data, - float scale, - int total_elements -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; - data[idx] = __float2half(__half2float(data[idx]) * scale); -} - -extern "C" void launch_scale_inplace( - void* data, - float scale, - int total_elements, - void* stream -) { - int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - scale_inplace_kernel<<>>( - (half*)data, scale, total_elements - ); -} - -// ============================================================================ -// Logit soft-cap: logits[i] = cap * tanh(logits[i] / cap) -// -// Clamps logits to [-cap, +cap] range. Gemma 4 uses cap=30.0. 
-// ============================================================================ - -static __global__ void logit_softcap_kernel( - half* __restrict__ logits, - float cap, - int total_elements -) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= total_elements) return; - float val = __half2float(logits[idx]); - logits[idx] = __float2half(cap * tanhf(val / cap)); -} - -extern "C" void launch_logit_softcap( - void* logits, - float cap, - int total_elements, - void* stream -) { - int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; - logit_softcap_kernel<<>>( - (half*)logits, cap, total_elements - ); -} - -// ============================================================================ -// Scale-free RMSNorm (no learnable weight): -// output = input * rsqrt(mean(input^2) + eps) -// -// Used for Gemma 4 V-norm (value states) and router input normalization. -// Same reduction as rmsnorm_kernel, but without the weight multiply. -// ============================================================================ - -static __global__ void rmsnorm_noscale_kernel( - const half* __restrict__ input, - half* __restrict__ output, - int dim, - float eps -) { - int row = blockIdx.x; - int tid = threadIdx.x; - int stride = blockDim.x; - - const half* x = input + (long long)row * dim; - half* out = output + (long long)row * dim; - - extern __shared__ float sdata[]; - - float local_sum = 0.0f; - for (int i = tid; i < dim; i += stride) { - float val = __half2float(x[i]); - local_sum += val * val; - } - sdata[tid] = local_sum; - __syncthreads(); - - for (int s = stride / 2; s > 0; s >>= 1) { - if (tid < s) { - sdata[tid] += sdata[tid + s]; - } - __syncthreads(); - } - - float rms = rsqrtf(sdata[0] / (float)dim + eps); - - for (int i = tid; i < dim; i += stride) { - out[i] = __float2half(__half2float(x[i]) * rms); - } -} - -extern "C" void launch_rmsnorm_noscale( - const void* input, - void* output, - int batch_size, - int dim, - float eps, - void* stream -) { - int 
threads = (dim < BLOCK_SIZE) ? dim : BLOCK_SIZE;
-    int smem = threads * sizeof(float);
-    rmsnorm_noscale_kernel<<<batch_size, threads, smem, (cudaStream_t)stream>>>(
-        (const half*)input, (half*)output, dim, eps
-    );
-}
-
-// ============================================================================
-// Partial RoPE: Apply rotary embeddings to only the first rotary_dim
-// dimensions of each head, leaving the remaining dimensions unchanged.
-//
-// Gemma 4 global attention layers use partial_rotary_factor=0.25, meaning
-// only 128 out of 512 head dimensions get rotary encoding.
-//
-// Uses the same rotary-half convention as launch_rope:
-//   pairs are (d[i], d[i + half_rotary]) for i in 0..half_rotary
-// ============================================================================
-
-static __global__ void rope_partial_kernel(
-    half* __restrict__ qk,   // [seq_len, num_heads, head_dim]
-    int num_heads,
-    int head_dim,
-    int rotary_dim,          // first rotary_dim dims get RoPE
-    int pos_offset,
-    float theta_base
-) {
-    int seq_idx = blockIdx.x;
-    int head = blockIdx.y;
-    int tid = threadIdx.x;
-    int half_rotary = rotary_dim / 2;
-
-    if (tid >= half_rotary || head >= num_heads) return;
-
-    int pos = seq_idx + pos_offset;
-    float freq = 1.0f / powf(theta_base, (float)(2 * tid) / (float)rotary_dim);
-    float angle = (float)pos * freq;
-    float cos_val = cosf(angle);
-    float sin_val = sinf(angle);
-
-    long long base_idx = ((long long)seq_idx * num_heads + head) * head_dim;
-    float v0 = __half2float(qk[base_idx + tid]);
-    float v1 = __half2float(qk[base_idx + tid + half_rotary]);
-    qk[base_idx + tid] = __float2half(v0 * cos_val - v1 * sin_val);
-    qk[base_idx + tid + half_rotary] = __float2half(v0 * sin_val + v1 * cos_val);
-    // Dimensions [rotary_dim, head_dim) are untouched
-}
-
-extern "C" void launch_rope_partial(
-    void* q,
-    void* k,
-    int seq_len,
-    int num_heads,
-    int num_kv_heads,
-    int head_dim,
-    int rotary_dim,
-    int pos_offset,
-    float theta_base,
-    void* stream
-) {
-    int threads = rotary_dim / 2;
-    if (threads
> BLOCK_SIZE) threads = BLOCK_SIZE;
-    dim3 grid_q(seq_len, num_heads);
-    dim3 grid_k(seq_len, num_kv_heads);
-
-    rope_partial_kernel<<<grid_q, threads, 0, (cudaStream_t)stream>>>(
-        (half*)q, num_heads, head_dim, rotary_dim, pos_offset, theta_base
-    );
-    rope_partial_kernel<<<grid_k, threads, 0, (cudaStream_t)stream>>>(
-        (half*)k, num_kv_heads, head_dim, rotary_dim, pos_offset, theta_base
-    );
-}
diff --git a/crates/weaver-inference/src/lib.rs b/crates/weaver-inference/src/lib.rs
deleted file mode 100644
index 5c9e8f52..00000000
--- a/crates/weaver-inference/src/lib.rs
+++ /dev/null
@@ -1,128 +0,0 @@
-//! Deprecated re-export shell — `weaver-inference`'s decoder-side
-//! modules folded into `weaver-spu` per `docs/specs/weaver-spu-Spec.md`
-//! PR-0.5.C.
-//!
-//! New code should import from `weaver_spu::decoder::*` (decoder
-//! modules) or `weaver_spu::core::*` (orchestrator, vram, nvlink,
-//! probe, gpu primitives).
-//!
-//! This crate stays in the workspace through the migration window so
-//! existing consumers compile unchanged. Removed in PR-0.5.E along
-//! with `weaver-embedding` and the rest of the legacy crates.
-
-// Module-scoped allow — this entire crate is a deprecated shell.
-// Every re-export below points at a stable weaver-spu module and
-// would otherwise need an individual `#[allow(deprecated)]`. The
-// crate retires in PR-0.5.E. External consumers see deprecation
-// warnings on their own `use` lines.
-#![allow(deprecated)] - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::config; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::config; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::download; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::download; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::engine; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::engine; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::family; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::family; - -#[deprecated( - since = "0.1.0", - note = "renamed to weaver_spu::decoder::llama_cpp_backend; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::llama_cpp_backend as backend; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::model; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::model; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::model_profile; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::model_profile; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::multi_model; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::multi_model; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::runtime_config; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::runtime_config; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::server; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::server; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::startup; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::startup; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::suggest; this 
re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::suggest; - -#[cfg(feature = "gguf")] -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::decoder::gguf; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::decoder::gguf; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::core::gpu_orchestrator; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::core::gpu_orchestrator; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::core::nvlink; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::core::nvlink; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::core::probe; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::core::probe; - -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::core::vram; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::core::vram; - -#[cfg(feature = "cuda")] -#[deprecated( - since = "0.1.0", - note = "moved to weaver_spu::core::gpu; this re-export goes away in PR-0.5.E" -)] -pub use weaver_spu::core::gpu; diff --git a/crates/weaver-interface/Cargo.toml b/crates/weaver-interface/Cargo.toml index 2e2d2516..de003827 100644 --- a/crates/weaver-interface/Cargo.toml +++ b/crates/weaver-interface/Cargo.toml @@ -15,7 +15,12 @@ path = "src/main.rs" [features] default = [] -inference = ["dep:weaver-inference"] +# `inference` opts the daemon into the candle decoder + GGUF/CUDA +# backends (post-PR-0.5.E this maps onto weaver-spu's `cuda` and +# `gguf` features). Pre-PR-0.5.E this enabled `dep:weaver-inference` +# which is now removed; the feature now activates the corresponding +# weaver-spu features instead. +inference = ["weaver-spu/cuda", "weaver-spu/gguf"] # `viewer` pulls in weaver-analysis (manifest schema + load) so the # trace viewer can ingest the bench-emitted manifest.json companion. 
viewer = ["dep:ratatui", "dep:crossterm", "dep:weaver-analysis"] @@ -25,9 +30,11 @@ weaver-core = { workspace = true } weaver-database = { path = "../weaver-database" } weaver-demo = { path = "../weaver-demo" } weaver-trace = { path = "../weaver-trace" } -weaver-embedding = { workspace = true } - -weaver-inference = { path = "../weaver-inference", features = ["cuda", "gguf"], optional = true } +# `weaver-spu` carries the decoder + encoder runtime post-PR-0.5.C/D/E. +# The `inference` feature above forwards `cuda` and `gguf` features +# through to `weaver-spu`. `default-features = false` per the +# workspace dep declaration. +weaver-spu = { workspace = true } anyhow = { workspace = true } async-trait = { workspace = true } chrono = { workspace = true } diff --git a/crates/weaver-interface/src/harness.rs b/crates/weaver-interface/src/harness.rs index 7f8736e9..720c4003 100644 --- a/crates/weaver-interface/src/harness.rs +++ b/crates/weaver-interface/src/harness.rs @@ -6,7 +6,7 @@ //! //! Today this is a single subcommand tree (`embedder`) for inspecting and //! managing the embedder identity pin written by `weaver serve` on first -//! boot. See `weaver_embedding::pin` for the lock-file design. +//! boot. See `weaver_spu::core::pin` for the lock-file design. //! //! The commands here do not touch the running embedder or GPU state — they //! only read and write `/opt/weaver/state/embedder.pin.json`. 
Operators use @@ -18,7 +18,7 @@ use std::path::{Path, PathBuf}; use anyhow::{Context, Result, bail}; use clap::{Args, Subcommand}; -use weaver_embedding::pin::{DEFAULT_PIN_PATH, EmbedderPin, WriteIfAbsent}; +use weaver_spu::core::pin::{DEFAULT_PIN_PATH, EmbedderPin, WriteIfAbsent}; #[derive(Subcommand, Debug)] pub enum HarnessAction { diff --git a/crates/weaver-interface/src/lib.rs b/crates/weaver-interface/src/lib.rs index 701c31ca..1d01959b 100644 --- a/crates/weaver-interface/src/lib.rs +++ b/crates/weaver-interface/src/lib.rs @@ -6,11 +6,13 @@ pub mod auth; pub mod daemon_client; -// `embedder_pin` moved to `weaver-embedding::pin` per -// `embedder-oxidization-Spec.md` §5.1 (issue #166 / sprint -// Block A.4). Cohort-pin is an embedding-substrate concern, not -// a CLI concern; it lives alongside the loader paths it guards. -// External consumers import from `weaver_embedding::pin` now. +// `embedder_pin` lives at `weaver_spu::core::pin` per +// `docs/specs/weaver-spu-Spec.md` PR-0.5.D (relocated there from +// the prior `weaver-embedding::pin` home in the original +// embedder-oxidization sprint). Cohort-pin is an embedding- +// substrate concern, not a CLI concern; it lives alongside the +// loader paths it guards. External consumers import from +// `weaver_spu::core::pin`. 
pub mod kinds; pub mod protocol; pub mod server; diff --git a/crates/weaver-interface/src/model.rs b/crates/weaver-interface/src/model.rs index fbbfe638..09db3ef4 100644 --- a/crates/weaver-interface/src/model.rs +++ b/crates/weaver-interface/src/model.rs @@ -241,7 +241,7 @@ async fn handle_load( #[cfg(feature = "inference")] fn handle_probe(path: String) -> Result<()> { use std::path::Path; - use weaver_inference::probe::probe_gguf; + use weaver_spu::core::probe::probe_gguf; let probe = probe_gguf(Path::new(&path))?; @@ -304,7 +304,7 @@ fn handle_probe(path: String) -> Result<()> { #[cfg(feature = "inference")] fn handle_families() -> Result<()> { - use weaver_inference::family::{ModelMode, supported_families}; + use weaver_spu::decoder::family::{ModelMode, supported_families}; let families = supported_families(); if families.is_empty() { @@ -342,7 +342,7 @@ fn handle_families() -> Result<()> { #[cfg(feature = "inference")] fn handle_registry(config: String) -> Result<()> { use std::path::Path; - use weaver_inference::multi_model::MultiModelConfig; + use weaver_spu::decoder::multi_model::MultiModelConfig; let path = Path::new(&config); if !path.exists() { @@ -396,9 +396,9 @@ async fn handle_install( socket: String, ) -> Result<()> { use std::path::PathBuf; - use weaver_inference::download::download_gguf_file; - use weaver_inference::probe::probe_gguf; - use weaver_inference::suggest::suggest_entry; + use weaver_spu::decoder::download::download_gguf_file; + use weaver_spu::core::probe::probe_gguf; + use weaver_spu::decoder::suggest::suggest_entry; use crate::toml_edit::append_model_entry; @@ -465,7 +465,7 @@ async fn handle_install( #[cfg(feature = "inference")] fn resolve_entry_name( explicit: Option<&str>, - probe: &weaver_inference::probe::GgufProbe, + probe: &weaver_spu::core::probe::GgufProbe, filename: &str, ) -> String { if let Some(n) = explicit { @@ -487,9 +487,9 @@ fn resolve_entry_name( #[cfg(feature = "inference")] async fn load_after_install( socket: 
&str, - entry: &weaver_inference::multi_model::ModelEntry, + entry: &weaver_spu::decoder::multi_model::ModelEntry, ) -> Result<()> { - use weaver_inference::multi_model::GpuSpec; + use weaver_spu::decoder::multi_model::GpuSpec; let provider = OpenAiProvider::unix(socket, 120)?; @@ -754,7 +754,7 @@ mod tests { #[cfg(feature = "inference")] #[test] fn resolve_entry_name_prefers_explicit() { - let probe = weaver_inference::probe::GgufProbe { + let probe = weaver_spu::core::probe::GgufProbe { gguf_version: 3, arch: "qwen2".to_string(), name: Some("Qwen Coder 32B".to_string()), @@ -771,7 +771,7 @@ mod tests { #[cfg(feature = "inference")] #[test] fn resolve_entry_name_uses_probe_name_when_available() { - let probe = weaver_inference::probe::GgufProbe { + let probe = weaver_spu::core::probe::GgufProbe { gguf_version: 3, arch: "qwen2".to_string(), name: Some("qwen25-coder".to_string()), @@ -788,7 +788,7 @@ mod tests { #[cfg(feature = "inference")] #[test] fn resolve_entry_name_falls_back_to_filename_stem() { - let probe = weaver_inference::probe::GgufProbe { + let probe = weaver_spu::core::probe::GgufProbe { gguf_version: 3, arch: "qwen2".to_string(), name: None, @@ -805,7 +805,7 @@ mod tests { #[cfg(feature = "inference")] #[test] fn resolve_entry_name_ignores_whitespace_only_probe_name() { - let probe = weaver_inference::probe::GgufProbe { + let probe = weaver_spu::core::probe::GgufProbe { gguf_version: 3, arch: "qwen2".to_string(), name: Some(" ".to_string()), diff --git a/crates/weaver-interface/src/serve.rs b/crates/weaver-interface/src/serve.rs index 85b02d18..e1cd69f4 100644 --- a/crates/weaver-interface/src/serve.rs +++ b/crates/weaver-interface/src/serve.rs @@ -10,11 +10,11 @@ use std::time::Duration; use anyhow::{Result, bail}; use clap::Args; -use weaver_embedding::grpc_client::{ +use weaver_spu::encoder::grpc_client_legacy::{ DEFAULT_EMBEDDER_SOCKET, EmbeddingClient, EmbeddingClientConfig, EmbeddingEndpoint, }; -use weaver_embedding::pin::{self, 
DEFAULT_PIN_PATH, EmbedderPin, VerifyOutcome}; +use weaver_spu::core::pin::{self, DEFAULT_PIN_PATH, EmbedderPin, VerifyOutcome}; const EMBEDDER_PROBE_TIMEOUT: Duration = Duration::from_secs(3); /// Total wall-clock budget for establishing an embedder probe. Under systemd, /// weaver-embedder.service is Type=simple and declared active the moment @@ -45,9 +45,9 @@ pub struct ServeArgs { pub async fn handle_serve(args: ServeArgs) -> Result<()> { // Load config or use empty defaults (server starts with no models) let mut config = if let Some(ref path) = args.config { - weaver_inference::multi_model::MultiModelConfig::from_file(path)? + weaver_spu::decoder::multi_model::MultiModelConfig::from_file(path)? } else { - weaver_inference::multi_model::MultiModelConfig::default() + weaver_spu::decoder::multi_model::MultiModelConfig::default() }; // Apply CLI overrides @@ -64,7 +64,7 @@ pub async fn handle_serve(args: ServeArgs) -> Result<()> { } // Build server state (loads any models declared in config) - let (state, loaded_names) = weaver_inference::startup::build_server_state(&config)?; + let (state, loaded_names) = weaver_spu::decoder::startup::build_server_state(&config)?; let model_count = loaded_names.len(); if model_count > 0 { @@ -143,7 +143,7 @@ pub async fn handle_serve(args: ServeArgs) -> Result<()> { } } - weaver_inference::server::serve( + weaver_spu::decoder::server::serve( state, Some(socket_path.as_path()), listen_addr.as_deref(), @@ -166,7 +166,7 @@ pub async fn handle_serve(args: ServeArgs) -> Result<()> { /// Once we're past startup, the refuse-to-start-when-pin-exists check in /// `handle_serve` catches a truly-wedged embedder: probe returns None → /// refuse, rather than silently running without the pin verified. 
-async fn probe_embedder(socket: &str) -> Option { +async fn probe_embedder(socket: &str) -> Option { let client_config = EmbeddingClientConfig { endpoint: EmbeddingEndpoint::Unix(PathBuf::from(socket)), connect_timeout: EMBEDDER_PROBE_TIMEOUT, @@ -215,7 +215,7 @@ async fn probe_embedder(socket: &str) -> Option Result<()> { let live = EmbedderPin::now( &info.model_name, @@ -256,7 +256,7 @@ mod tests { use super::*; use clap::Parser; use tempfile::TempDir; - use weaver_embedding::proto::embedding::InfoResponse; + use weaver_spu::proto::embedding::InfoResponse; #[derive(Parser, Debug)] struct TestCli { diff --git a/crates/weaver-interface/src/toml_edit.rs b/crates/weaver-interface/src/toml_edit.rs index 65cdb815..12aeef53 100644 --- a/crates/weaver-interface/src/toml_edit.rs +++ b/crates/weaver-interface/src/toml_edit.rs @@ -22,7 +22,7 @@ use std::path::{Path, PathBuf}; use std::time::{SystemTime, UNIX_EPOCH}; use serde::Serialize; -use weaver_inference::multi_model::{ModelEntry, MultiModelConfig}; +use weaver_spu::decoder::multi_model::{ModelEntry, MultiModelConfig}; /// Result of [`append_model_entry`]: the path written plus the block of TOML /// text that was appended (useful for CLI output). 
@@ -224,8 +224,8 @@ fn atomic_write(path: &Path, bytes: &[u8]) -> anyhow::Result<()> { mod tests { use super::*; use tempfile::TempDir; - use weaver_inference::family::{ModelMode, Pooling}; - use weaver_inference::multi_model::{BackendKind, GpuSpec}; + use weaver_spu::decoder::family::{ModelMode, Pooling}; + use weaver_spu::decoder::multi_model::{BackendKind, GpuSpec}; fn gguf_entry(name: &str, path: &str) -> ModelEntry { ModelEntry { diff --git a/crates/weaver-spu/src/decoder/server/prompt.rs b/crates/weaver-spu/src/decoder/server/prompt.rs index 137b9040..58b898ef 100644 --- a/crates/weaver-spu/src/decoder/server/prompt.rs +++ b/crates/weaver-spu/src/decoder/server/prompt.rs @@ -1,3 +1,10 @@ +// Module-scoped allow — pre-existing hidden-lifetime patterns in +// this file surface under weaver-spu's `#![warn(rust_2018_idioms)]` +// (downgraded from `deny` in PR-0.5.C when the file was folded in +// from weaver-inference). Fixing them is a future cleanup PR; out +// of scope for the no-op consolidation in PR-0.5.E. +#![allow(elided_lifetimes_in_paths)] + //! Prompt building and tool-call parsing — pure functions, no CUDA dependency. //! //! These functions convert OpenAI-format messages into model-specific prompt diff --git a/crates/weaver-spu/src/encoder/gguf_backend.rs b/crates/weaver-spu/src/encoder/gguf_backend.rs index 2999e078..4df01d96 100644 --- a/crates/weaver-spu/src/encoder/gguf_backend.rs +++ b/crates/weaver-spu/src/encoder/gguf_backend.rs @@ -234,17 +234,15 @@ fn l2_normalize(v: &[f32]) -> Vec { #[cfg(test)] mod tests { use super::*; - // Tests load a real GGUF via `weaver_inference`'s decoder - // model-loader because that's where `init_backend` / `load_model` - // / `GgufModelParams` live (per the post-Phase-0 ontology: - // model-runtime primitives stay in `weaver_inference`; - // embedding-pipeline composition lives here in - // `weaver_embedding`). `weaver-inference` is therefore a - // dev-dependency of `weaver-embedding`. 
A later PR (Phase 1 - // — the `EmbedderClient` impl) makes it a regular dependency - // when the production embedder code calls `load_model` itself. + // Tests load a real GGUF via the decoder-side model-loader + // primitives (`init_backend` / `load_model` / `GgufModelParams` + // in `weaver_spu::decoder::gguf`). Both halves of the SPU now + // live in this crate, so this is a sibling-module import — no + // cross-crate dev-dep wiring needed. The legacy comment about + // `weaver-inference` and `weaver-embedding` being separate + // crates is gone with PR-0.5.E. use std::sync::Mutex; - use weaver_inference::gguf::{GgufModelParams, init_backend, load_model}; + use weaver_spu::decoder::gguf::{GgufModelParams, init_backend, load_model}; const JINA_RETRIEVAL_Q6K: &str = "/opt/weaver/models/jina-embeddings-v4-text-retrieval-Q6_K.gguf";