Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
658 changes: 31 additions & 627 deletions Cargo.lock

Large diffs are not rendered by default.

134 changes: 12 additions & 122 deletions crates/weaver-embedding/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,131 +4,21 @@ version.workspace = true
edition.workspace = true
rust-version.workspace = true
license.workspace = true
description = "Embedding pipelines (chunking, prompt-prefix routing, pooling, late-chunking, gRPC client, native llama.cpp backend) for WeaverTools."

# Crate skeleton landed in PR #248. Subsequent move PRs in
# Block A of the unified-SPU sprint pull existing pieces in:
#
# - `late_chunk.rs` from `weaver-database::chunking::late` (PR #249).
# - `gguf_backend.rs` from `weaver-inference::gguf_embed` (this PR).
# - `grpc_client.rs` from `weaver-database::persephone::embedding` (next).
# - `pin.rs` from `weaver-interface::embedder_pin` (next).
# - The `Embedder` trait + the new in-process Rust backend
# (`embedder_client.rs`) implementing it (Phase 1).
description = "Deprecated re-export shell — content folded into weaver-spu (encoder side) + weaver-core (Embedder trait) per PR-0.5.B + PR-0.5.D; removed in PR-0.5.E."

[features]
# Forward feature flags to weaver-spu so consumers that opt into
# `gguf` (legacy GGUF encoder backend + Persephone gRPC client) via
# this crate still get the corresponding weaver-spu feature enabled.
# Removed in PR-0.5.E.
default = []
# Native llama.cpp GGUF embedding backend. Behind a feature flag
# so the gRPC-only consumer path (Python embedder during the
# migration window) doesn't pull `llama-cpp-2` and its CUDA
# build dependencies. Mirrors the same-named feature in
# `weaver-inference` — both crates have an independent `gguf`
# feature for their respective decoder / encoder use of llama.cpp.
gguf = ["dep:llama-cpp-2"]

# candle-based encoder backend (Block C of the unified-SPU
# sprint, per `embedder-oxidization-Spec.md` §6). Pulls candle
# for safetensors loading + FP16 forward pass and the HF
# `tokenizers` crate. CPU-only by default; the operator opts into
# CUDA via the `candle-cuda` feature on top. Gated separately
# from `gguf` so the migration-window builds (gRPC-only
# consumers) don't pull candle.
candle = ["dep:candle-core", "dep:candle-nn", "dep:tokenizers"]
# CUDA acceleration for the candle backend. Implies `candle`.
# Pulls `nvcc` + `cudnn` build deps from candle-{core,nn}; the
# default `candle` feature stays CPU so workspace `cargo check`
# without CUDA toolchain still passes.
candle-cuda = ["candle", "candle-core/cuda", "candle-nn/cuda"]
gguf = ["weaver-spu/gguf"]

[dependencies]
# `weaver-core` carries the `Embedder` trait + associated types
# post-PR-0.5.B. `grpc_client::EmbeddingClient`'s `Embedder` impl
# is in this crate (allowed by orphan rules: trait in weaver-core,
# concrete type in this crate). `embedder.rs` re-exports the trait
# during the transition window so existing consumers compile
# unchanged.
# Sole runtime deps: weaver-core (for the relocated `Embedder` trait
# + types) and weaver-spu (for the relocated late_chunk / pin /
# legacy gRPC + GGUF backends + proto module).
# `default-features = false` on weaver-spu to avoid feature leakage —
# this shell crate forwards features explicitly via the `gguf` flag.
weaver-core = { workspace = true }

# `tracing` and `anyhow` are unconditional — `late_chunk` is a
# pure-stdlib algorithm but we want module-level instrumentation
# hooks and ergonomic error returns available for any future
# addition without re-gating. The `gguf_backend` (feature-gated)
# uses `anyhow::Result` for its public surface today; future
# native-backend code in this crate will share the same error
# convention.
tracing = { workspace = true }
anyhow = { workspace = true }
thiserror = { workspace = true }

# `async-trait` for the [`Embedder`] trait's `async fn` methods.
# Object-safe-via-`#[async_trait]` so consumers can hold
# `Arc<dyn Embedder>`.
async-trait = { workspace = true }

# `pin` (the cohort-identity lock-file module) needs serde for
# the `EmbedderPin` struct, serde_json for the JSON read/write
# path, and chrono for stamping `pinned_at` rfc3339.
serde = { workspace = true }
serde_json = { workspace = true }
chrono = { workspace = true }

# `grpc_client` (the Persephone embedding gRPC client) — same
# transport stack `weaver-database`'s removed `persephone`
# module used. The Unix-socket transport uses `hyper-util` +
# `tower::service_fn` to wrap a UDS connection into a tonic
# `Channel`. `prost` carries the generated proto messages.
# `tokio` (UnixStream) handles the actual UDS connect.
tonic = { workspace = true }
prost = { workspace = true }
prost-types = { workspace = true }
hyper-util = { workspace = true }
tower = { workspace = true }
tokio = { workspace = true }

# Native llama.cpp backend. Same version + features as
# `weaver-inference`'s `llama-cpp-2` declaration; both crates
# load `llama-cpp-2` independently but the singleton `LlamaBackend`
# they share is still process-global (per
# `embedder-oxidization-Spec.md` §4.4).
llama-cpp-2 = { version = "0.1.143", optional = true, default-features = false, features = ["cuda", "dynamic-link", "mtmd"] }

# candle FP16 safetensors backend (gated behind the `candle`
# feature). `candle-core` carries Tensor / Device / DType;
# `candle-nn` carries the layer primitives (linear, layer-norm,
# embedding) used by the XLM-RoBERTa encoder + LoRA adapter
# wiring per `embedder-oxidization-Spec.md` §6.2 / §6.3. The
# `candle-cuda` feature on this crate enables `cuda` on both
# candle crates simultaneously; default `candle` stays CPU so a
# bare `cargo check --features candle` works without nvcc.
candle-core = { workspace = true, optional = true }
candle-nn = { workspace = true, optional = true }

# HF `tokenizers` crate — production reads the same
# `tokenizer.json` the Python reference uses (Spec §7.1).
tokenizers = { workspace = true, optional = true }

[build-dependencies]
# Generates `crate::proto::embedding::*` from the embedding.proto
# at workspace `proto/persephone/embedding/embedding.proto`. See
# build.rs for the rationale on duplicate generation alongside
# `weaver-database`'s build.rs during the migration window.
tonic-build = { workspace = true }

[dev-dependencies]
# Tests in `gguf_backend.rs` load a real GGUF via
# `weaver-inference::gguf::{init_backend, load_model,
# GgufModelParams}` — those primitives belong to the model-runtime
# layer (`weaver-inference`), not the embedding-pipeline layer
# (this crate). The dev-dep keeps tests passing in their new home.
# Phase 1's `EmbedderClient` impl will make this a regular
# dependency.
weaver-inference = { workspace = true, features = ["gguf"] }

# `pin` tests use `tempfile::TempDir` for hermetic file-system
# fixtures.
tempfile = { workspace = true }

# `grpc_client` integration tests stand up a mock tonic server
# over a `TcpListener`-derived stream. `tokio-stream` provides
# the wrapper.
tokio-stream = { workspace = true }
weaver-spu = { workspace = true, default-features = false }
44 changes: 0 additions & 44 deletions crates/weaver-embedding/build.rs

This file was deleted.

30 changes: 0 additions & 30 deletions crates/weaver-embedding/src/embedder.rs

This file was deleted.

Loading