toddwbucy · toddwbucy · May 5, 2026 · May 5, 2026 · May 5, 2026 · May 5, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/weaver-embedding/Cargo.toml b/crates/weaver-embedding/Cargo.toml
@@ -4,131 +4,21 @@ version.workspace = true
 edition.workspace = true
 rust-version.workspace = true
 license.workspace = true
-description = "Embedding pipelines (chunking, prompt-prefix routing, pooling, late-chunking, gRPC client, native llama.cpp backend) for WeaverTools."
-
-# Crate skeleton landed in PR #248. Subsequent move PRs in
-# Block A of the unified-SPU sprint pull existing pieces in:
-#
-# - `late_chunk.rs` from `weaver-database::chunking::late` (PR #249).
-# - `gguf_backend.rs` from `weaver-inference::gguf_embed` (this PR).
-# - `grpc_client.rs` from `weaver-database::persephone::embedding` (next).
-# - `pin.rs` from `weaver-interface::embedder_pin` (next).
-# - The `Embedder` trait + the new in-process Rust backend
-#   (`embedder_client.rs`) implementing it (Phase 1).
+description = "Deprecated re-export shell — content folded into weaver-spu (encoder side) + weaver-core (Embedder trait) per PR-0.5.B + PR-0.5.D; removed in PR-0.5.E."
 
 [features]
+# Forward the `gguf` feature to weaver-spu so consumers that opt into
+# it (legacy GGUF encoder backend + Persephone gRPC client) via this
+# crate still get the corresponding weaver-spu feature enabled.
+# To be removed in PR-0.5.E.
 default = []
-# Native llama.cpp GGUF embedding backend. Behind a feature flag
-# so the gRPC-only consumer path (Python embedder during the
-# migration window) doesn't pull `llama-cpp-2` and its CUDA
-# build dependencies. Mirrors the same-named feature in
-# `weaver-inference` — both crates have an independent `gguf`
-# feature for their respective decoder / encoder use of llama.cpp.
-gguf = ["dep:llama-cpp-2"]
-
-# candle-based encoder backend (Block C of the unified-SPU
-# sprint, per `embedder-oxidization-Spec.md` §6). Pulls candle
-# for safetensors loading + FP16 forward pass and the HF
-# `tokenizers` crate. CPU-only by default; the operator opts into
-# CUDA via the `candle-cuda` feature on top. Gated separately
-# from `gguf` so the migration-window builds (gRPC-only
-# consumers) don't pull candle.
-candle = ["dep:candle-core", "dep:candle-nn", "dep:tokenizers"]
-# CUDA acceleration for the candle backend. Implies `candle`.
-# Pulls `nvcc` + `cudnn` build deps from candle-{core,nn}; the
-# default `candle` feature stays CPU so workspace `cargo check`
-# without CUDA toolchain still passes.
-candle-cuda = ["candle", "candle-core/cuda", "candle-nn/cuda"]
+gguf = ["weaver-spu/gguf"]
 
 [dependencies]
-# `weaver-core` carries the `Embedder` trait + associated types
-# post-PR-0.5.B. `grpc_client::EmbeddingClient`'s `Embedder` impl
-# is in this crate (allowed by orphan rules: trait in weaver-core,
-# concrete type in this crate). `embedder.rs` re-exports the trait
-# during the transition window so existing consumers compile
-# unchanged.
+# Sole runtime deps: weaver-core (for the relocated `Embedder` trait
+# + types) and weaver-spu (for the relocated late_chunk / pin /
+# legacy gRPC + GGUF backends + proto module).
+# weaver-spu is declared with `default-features = false` to avoid
+# feature leakage; this shell crate forwards `gguf` explicitly instead.
 weaver-core = { workspace = true }
-
-# `tracing` and `anyhow` are unconditional — `late_chunk` is a
-# pure-stdlib algorithm but we want module-level instrumentation
-# hooks and ergonomic error returns available for any future
-# addition without re-gating. The `gguf_backend` (feature-gated)
-# uses `anyhow::Result` for its public surface today; future
-# native-backend code in this crate will share the same error
-# convention.
-tracing = { workspace = true }
-anyhow = { workspace = true }
-thiserror = { workspace = true }
-
-# `async-trait` for the [`Embedder`] trait's `async fn` methods.
-# Object-safe-via-`#[async_trait]` so consumers can hold
-# `Arc<dyn Embedder>`.
-async-trait = { workspace = true }
-
-# `pin` (the cohort-identity lock-file module) needs serde for
-# the `EmbedderPin` struct, serde_json for the JSON read/write
-# path, and chrono for stamping `pinned_at` rfc3339.
-serde = { workspace = true }
-serde_json = { workspace = true }
-chrono = { workspace = true }
-
-# `grpc_client` (the Persephone embedding gRPC client) — same
-# transport stack `weaver-database`'s removed `persephone`
-# module used. The Unix-socket transport uses `hyper-util` +
-# `tower::service_fn` to wrap a UDS connection into a tonic
-# `Channel`. `prost` carries the generated proto messages.
-# `tokio` (UnixStream) handles the actual UDS connect.
-tonic = { workspace = true }
-prost = { workspace = true }
-prost-types = { workspace = true }
-hyper-util = { workspace = true }
-tower = { workspace = true }
-tokio = { workspace = true }
-
-# Native llama.cpp backend. Same version + features as
-# `weaver-inference`'s `llama-cpp-2` declaration; both crates
-# load `llama-cpp-2` independently but the singleton `LlamaBackend`
-# they share is still process-global (per
-# `embedder-oxidization-Spec.md` §4.4).
-llama-cpp-2 = { version = "0.1.143", optional = true, default-features = false, features = ["cuda", "dynamic-link", "mtmd"] }
-
-# candle FP16 safetensors backend (gated behind the `candle`
-# feature). `candle-core` carries Tensor / Device / DType;
-# `candle-nn` carries the layer primitives (linear, layer-norm,
-# embedding) used by the XLM-RoBERTa encoder + LoRA adapter
-# wiring per `embedder-oxidization-Spec.md` §6.2 / §6.3. The
-# `candle-cuda` feature on this crate enables `cuda` on both
-# candle crates simultaneously; default `candle` stays CPU so a
-# bare `cargo check --features candle` works without nvcc.
-candle-core = { workspace = true, optional = true }
-candle-nn = { workspace = true, optional = true }
-
-# HF `tokenizers` crate — production reads the same
-# `tokenizer.json` the Python reference uses (Spec §7.1).
-tokenizers = { workspace = true, optional = true }
-
-[build-dependencies]
-# Generates `crate::proto::embedding::*` from the embedding.proto
-# at workspace `proto/persephone/embedding/embedding.proto`. See
-# build.rs for the rationale on duplicate generation alongside
-# `weaver-database`'s build.rs during the migration window.
-tonic-build = { workspace = true }
-
-[dev-dependencies]
-# Tests in `gguf_backend.rs` load a real GGUF via
-# `weaver-inference::gguf::{init_backend, load_model,
-# GgufModelParams}` — those primitives belong to the model-runtime
-# layer (`weaver-inference`), not the embedding-pipeline layer
-# (this crate). The dev-dep keeps tests passing in their new home.
-# Phase 1's `EmbedderClient` impl will make this a regular
-# dependency.
-weaver-inference = { workspace = true, features = ["gguf"] }
-
-# `pin` tests use `tempfile::TempDir` for hermetic file-system
-# fixtures.
-tempfile = { workspace = true }
-
-# `grpc_client` integration tests stand up a mock tonic server
-# over a `TcpListener`-derived stream. `tokio-stream` provides
-# the wrapper.
-tokio-stream = { workspace = true }
+weaver-spu = { workspace = true, default-features = false }
diff --git a/crates/weaver-embedding/build.rs b/crates/weaver-embedding/build.rs
diff --git a/crates/weaver-embedding/src/embedder.rs b/crates/weaver-embedding/src/embedder.rs