diff --git a/crates/weaver-spu/build.rs b/crates/weaver-spu/build.rs index e6e1cac2..da55046c 100644 --- a/crates/weaver-spu/build.rs +++ b/crates/weaver-spu/build.rs @@ -18,29 +18,7 @@ use std::path::PathBuf; fn main() -> Result<(), Box> { if std::env::var("CARGO_FEATURE_CUDA").is_ok() { - // The `cuda` feature gates the legacy cudarc-based decoder path - // (slated for retirement in Phase 3 cleanup per Cargo.toml). The - // hand-written CUDA kernels for that path live at - // `kernels/transformer.cu`; they were lost in the PR-0.5.E - // shell-crate consolidation. Skip the compile gracefully when - // the file is absent so `--features inference` builds aren't - // broken on main pending a separate kernel-restore PR. - let manifest_dir = std::env::var("CARGO_MANIFEST_DIR") - .expect("CARGO_MANIFEST_DIR not set"); - let kernel = PathBuf::from(&manifest_dir).join("kernels/transformer.cu"); - if kernel.exists() { - compile_cuda_kernels(); - } else { - println!( - "cargo:warning=weaver-spu: skipping CUDA kernel compile — \ - kernels/transformer.cu absent (pre-existing regression \ - from PR-0.5.E; cudarc decoder path will fail at link \ - if its symbols are referenced)" - ); - // Still emit the rerun-if-changed so a future kernel restore - // triggers a rebuild. - println!("cargo:rerun-if-changed=kernels/transformer.cu"); - } + compile_cuda_kernels(); } // Persephone proto is always compiled — `encoder::grpc_client_legacy` // is the production embedder backend during the migration window diff --git a/crates/weaver-spu/kernels/transformer.cu b/crates/weaver-spu/kernels/transformer.cu new file mode 100644 index 00000000..9eba1359 --- /dev/null +++ b/crates/weaver-spu/kernels/transformer.cu @@ -0,0 +1,1266 @@ +// Transformer inference kernels for Qwen/Llama/Mistral/Gemma4 architectures. +// +// All kernels operate on f16 data (__half). Element-wise operations +// accumulate in f32 internally for numerical stability. +// +// Each __global__ kernel has a corresponding extern "C" host-side +// launcher function that the Rust FFI calls. The launcher computes +// grid/block dimensions and invokes the kernel. +// +// Compiled to SASS for sm_86 (A6000) and sm_89 (RTX Ada) via build.rs. 
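+//
+// For reference, the Rust side is expected to bind each launcher as a plain C
+// function. Illustrative declaration only — the actual FFI module is not part
+// of this file and its exact signatures may differ:
+//
+//     extern "C" {
+//         fn launch_rmsnorm(
+//             input: *const std::ffi::c_void,
+//             weight: *const std::ffi::c_void,
+//             output: *mut std::ffi::c_void,
+//             batch_size: i32,
+//             hidden_size: i32,
+//             eps: f32,
+//             stream: *mut std::ffi::c_void,
+//         );
+//     }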
+
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include <math.h>
+
+// Block size for 1D kernels
+#define BLOCK_SIZE 256
+
+// ============================================================================
+// RMSNorm: x_norm = x * rsqrt(mean(x^2) + eps) * weight
+// ============================================================================
+
+static __global__ void rmsnorm_kernel(
+    const half* __restrict__ input,
+    const half* __restrict__ weight,
+    half* __restrict__ output,
+    int hidden_size,
+    float eps
+) {
+    int row = blockIdx.x;
+    int tid = threadIdx.x;
+    int stride = blockDim.x;
+
+    const half* x = input + (long long)row * hidden_size;
+    half* out = output + (long long)row * hidden_size;
+
+    extern __shared__ float sdata[];
+
+    float local_sum = 0.0f;
+    for (int i = tid; i < hidden_size; i += stride) {
+        float val = __half2float(x[i]);
+        local_sum += val * val;
+    }
+    sdata[tid] = local_sum;
+    __syncthreads();
+
+    for (int s = stride / 2; s > 0; s >>= 1) {
+        if (tid < s) {
+            sdata[tid] += sdata[tid + s];
+        }
+        __syncthreads();
+    }
+
+    float rms = rsqrtf(sdata[0] / (float)hidden_size + eps);
+
+    for (int i = tid; i < hidden_size; i += stride) {
+        float val = __half2float(x[i]) * rms;
+        out[i] = __float2half(val * __half2float(weight[i]));
+    }
+}
+
+extern "C" void launch_rmsnorm(
+    const void* input,
+    const void* weight,
+    void* output,
+    int batch_size,
+    int hidden_size,
+    float eps,
+    void* stream
+) {
+    int threads = (hidden_size < BLOCK_SIZE) ? hidden_size : BLOCK_SIZE;
+    int smem = threads * sizeof(float);
+    rmsnorm_kernel<<<batch_size, threads, smem, (cudaStream_t)stream>>>(
+        (const half*)input, (const half*)weight, (half*)output,
+        hidden_size, eps
+    );
+}
+
+// ============================================================================
+// RoPE: Rotary Position Embedding (rotary-half convention)
+//
+// Qwen/Llama/Mistral use rotary-half: pairs are (d[i], d[i + half_dim])
+// NOT interleaved pairs (d[2i], d[2i+1]).
+// +// For each pair: +// new_d[i] = d[i] * cos(angle) - d[i + half_dim] * sin(angle) +// new_d[i + half_dim] = d[i] * sin(angle) + d[i + half_dim] * cos(angle) +// ============================================================================ + +static __global__ void rope_kernel( + half* __restrict__ qk, // [seq_len, num_heads, head_dim] + int num_heads, + int head_dim, + int pos_offset, + float theta_base +) { + int seq_idx = blockIdx.x; + int head = blockIdx.y; + int tid = threadIdx.x; + int half_dim = head_dim / 2; + + if (tid >= half_dim || head >= num_heads) return; + + int pos = seq_idx + pos_offset; + float freq = 1.0f / powf(theta_base, (float)(2 * tid) / (float)head_dim); + float angle = (float)pos * freq; + float cos_val = cosf(angle); + float sin_val = sinf(angle); + + long long base_idx = ((long long)seq_idx * num_heads + head) * head_dim; + float v0 = __half2float(qk[base_idx + tid]); + float v1 = __half2float(qk[base_idx + tid + half_dim]); + qk[base_idx + tid] = __float2half(v0 * cos_val - v1 * sin_val); + qk[base_idx + tid + half_dim] = __float2half(v0 * sin_val + v1 * cos_val); +} + +extern "C" void launch_rope( + void* q, + void* k, + int seq_len, + int num_heads, + int num_kv_heads, + int head_dim, + int pos_offset, + float theta_base, + void* stream +) { + int threads = head_dim / 2; + if (threads > BLOCK_SIZE) threads = BLOCK_SIZE; + dim3 grid_q(seq_len, num_heads); + dim3 grid_k(seq_len, num_kv_heads); + + rope_kernel<<>>( + (half*)q, num_heads, head_dim, pos_offset, theta_base + ); + rope_kernel<<>>( + (half*)k, num_kv_heads, head_dim, pos_offset, theta_base + ); +} + +// ============================================================================ +// SwiGLU: output = silu(gate) * up, where silu(x) = x * sigmoid(x) +// ============================================================================ + +static __global__ void swiglu_kernel( + const half* __restrict__ gate, + const half* __restrict__ up, + half* __restrict__ output, + int total_elements +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_elements) return; + + float g = __half2float(gate[idx]); + float u = __half2float(up[idx]); + float silu_g = g / (1.0f + expf(-g)); + output[idx] = __float2half(silu_g * u); +} + +extern "C" void launch_swiglu( + const void* gate, + const void* up, + void* output, + int total_elements, + void* stream +) { + int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + swiglu_kernel<<>>( + (const half*)gate, (const half*)up, (half*)output, total_elements + ); +} + +// ============================================================================ +// Residual add: output += residual (in-place) +// ============================================================================ + +static __global__ void residual_add_kernel( + half* __restrict__ output, + const half* __restrict__ residual, + int total_elements +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_elements) return; + float val = __half2float(output[idx]) + __half2float(residual[idx]); + output[idx] = __float2half(val); +} + +extern "C" void launch_residual_add( + void* output, + const void* residual, + int total_elements, + void* stream +) { + int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + residual_add_kernel<<>>( + (half*)output, (const half*)residual, total_elements + ); +} + +// ============================================================================ +// Bias add: output[pos * dim + d] += bias[d] for all positions +// 
============================================================================ + +static __global__ void bias_add_kernel( + half* __restrict__ output, + const half* __restrict__ bias, + int dim, + int seq_len +) { + int pos = blockIdx.x; + int tid = threadIdx.x; + int stride = blockDim.x; + + if (pos >= seq_len) return; + + half* out = output + (long long)pos * dim; + for (int d = tid; d < dim; d += stride) { + float val = __half2float(out[d]) + __half2float(bias[d]); + out[d] = __float2half(val); + } +} + +extern "C" void launch_bias_add( + void* output, + const void* bias, + int dim, + int seq_len, + void* stream +) { + int threads = (dim < BLOCK_SIZE) ? dim : BLOCK_SIZE; + bias_add_kernel<<>>( + (half*)output, (const half*)bias, dim, seq_len + ); +} + +// ============================================================================ +// Embedding lookup: output[i] = embed_table[token_ids[i]] +// ============================================================================ + +static __global__ void embedding_kernel( + const half* __restrict__ embed_table, + const int* __restrict__ token_ids, + half* __restrict__ output, + int hidden_size, + int seq_len +) { + int pos = blockIdx.x; + int tid = threadIdx.x; + int stride = blockDim.x; + + if (pos >= seq_len) return; + + int token_id = token_ids[pos]; + const half* row = embed_table + (long long)token_id * hidden_size; + half* out = output + (long long)pos * hidden_size; + + for (int i = tid; i < hidden_size; i += stride) { + out[i] = row[i]; + } +} + +extern "C" void launch_embedding( + const void* embed_table, + const int* token_ids, + void* output, + int hidden_size, + int seq_len, + void* stream +) { + int threads = (hidden_size < BLOCK_SIZE) ? hidden_size : BLOCK_SIZE; + embedding_kernel<<>>( + (const half*)embed_table, token_ids, (half*)output, + hidden_size, seq_len + ); +} + +// ============================================================================ +// Causal attention (single-head, strided access for GQA) +// +// Q, K, V, output may be strided — heads are interleaved within each +// position in the multi-head tensor. The stride parameters tell the +// kernel how many elements to skip between consecutive positions. +// +// For prefill (seq_len > 1): full QK^T + causal mask + softmax + V +// For decode (seq_len == 1): dot product attention against KV cache +// +// This is a simple implementation. For production, use FlashAttention. +// ============================================================================ + +static __global__ void attention_scores_kernel( + const half* __restrict__ q, // [seq_len, ...] one head, strided + const half* __restrict__ k, // [total_len, ...] 
one KV head, strided + float* __restrict__ scores, // [seq_len, total_len] + int seq_len, + int total_len, + int head_dim, + int q_stride, // elements between positions in Q + int kv_stride, // elements between positions in K + float scale +) { + int q_pos = blockIdx.x; + int tid = threadIdx.x; + int stride = blockDim.x; + + if (q_pos >= seq_len) return; + + const half* q_row = q + (long long)q_pos * q_stride; + + for (int k_pos = tid; k_pos < total_len; k_pos += stride) { + // Causal mask: only attend to positions <= q_pos + (total_len - seq_len) + int q_abs = q_pos + (total_len - seq_len); + if (k_pos > q_abs) { + scores[q_pos * total_len + k_pos] = -1e9f; + continue; + } + + const half* k_row = k + (long long)k_pos * kv_stride; + float dot = 0.0f; + for (int d = 0; d < head_dim; d++) { + dot += __half2float(q_row[d]) * __half2float(k_row[d]); + } + scores[q_pos * total_len + k_pos] = dot * scale; + } +} + +static __global__ void softmax_kernel( + float* __restrict__ data, + int cols +) { + int row = blockIdx.x; + int tid = threadIdx.x; + int stride = blockDim.x; + float* row_data = data + (long long)row * cols; + + extern __shared__ float sdata[]; + + // Find max + float local_max = -1e30f; + for (int i = tid; i < cols; i += stride) { + if (row_data[i] > local_max) local_max = row_data[i]; + } + sdata[tid] = local_max; + __syncthreads(); + + for (int s = stride / 2; s > 0; s >>= 1) { + if (tid < s) sdata[tid] = fmaxf(sdata[tid], sdata[tid + s]); + __syncthreads(); + } + float max_val = sdata[0]; + + // Sum of exp + float local_sum = 0.0f; + for (int i = tid; i < cols; i += stride) { + row_data[i] = expf(row_data[i] - max_val); + local_sum += row_data[i]; + } + sdata[tid] = local_sum; + __syncthreads(); + + for (int s = stride / 2; s > 0; s >>= 1) { + if (tid < s) sdata[tid] += sdata[tid + s]; + __syncthreads(); + } + + float inv_sum = 1.0f / sdata[0]; + for (int i = tid; i < cols; i += stride) { + row_data[i] *= inv_sum; + } +} + +static __global__ void attention_output_kernel( + const float* __restrict__ scores, // [seq_len, total_len] + const half* __restrict__ v, // [total_len, ...] one KV head, strided + half* __restrict__ output, // [seq_len, ...] one head, strided + int seq_len, + int total_len, + int head_dim, + int kv_stride, // elements between positions in V + int out_stride // elements between positions in output +) { + int q_pos = blockIdx.x; + int d = threadIdx.x; + + if (q_pos >= seq_len || d >= head_dim) return; + + const float* score_row = scores + (long long)q_pos * total_len; + + float acc = 0.0f; + for (int k_pos = 0; k_pos < total_len; k_pos++) { + acc += score_row[k_pos] * __half2float(v[(long long)k_pos * kv_stride + d]); + } + output[(long long)q_pos * out_stride + d] = __float2half(acc); +} + +extern "C" void launch_attention( + const void* q, // [seq_len, ...] one head (strided) + const void* k, // [total_len, ...] one KV head (strided) + const void* v, // [total_len, ...] one KV head (strided) + void* output, // [seq_len, ...] one head (strided) + void* scores_buf, // [seq_len, total_len] f32 scratch + int seq_len, + int total_len, + int head_dim, + int q_stride, // stride between positions in Q/output + int kv_stride, // stride between positions in K/V + int out_stride, // stride between positions in output + float scale, + void* stream +) { + cudaStream_t s = (cudaStream_t)stream; + + // Step 1: QK^T scores with causal mask + int threads1 = (total_len < BLOCK_SIZE) ? 
total_len : BLOCK_SIZE; + attention_scores_kernel<<>>( + (const half*)q, (const half*)k, (float*)scores_buf, + seq_len, total_len, head_dim, q_stride, kv_stride, scale + ); + + // Step 2: Softmax + int threads2 = (total_len < BLOCK_SIZE) ? total_len : BLOCK_SIZE; + int t2 = 1; + while (t2 < threads2) t2 <<= 1; + if (t2 > BLOCK_SIZE) t2 = BLOCK_SIZE; + int smem = t2 * sizeof(float); + softmax_kernel<<>>( + (float*)scores_buf, total_len + ); + + // Step 3: Score * V + int threads3 = (head_dim < BLOCK_SIZE) ? head_dim : BLOCK_SIZE; + attention_output_kernel<<>>( + (const float*)scores_buf, (const half*)v, (half*)output, + seq_len, total_len, head_dim, kv_stride, out_stride + ); +} + +// ============================================================================ +// Split fused projection output into separate contiguous buffers. +// +// After a fused QKV or Gate+Up GEMM, the output is [seq_len, total_dim] +// with sub-tensors interleaved per row. This kernel deinterleaves into +// separate contiguous output buffers. +// ============================================================================ + +static __global__ void split_fused_kernel( + const half* __restrict__ fused, // [seq_len, total_dim] + half* __restrict__ out_a, // [seq_len, dim_a] + half* __restrict__ out_b, // [seq_len, dim_b] + half* __restrict__ out_c, // [seq_len, dim_c] or NULL for 2-way split + int seq_len, + int total_dim, + int dim_a, + int dim_b +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int total = seq_len * total_dim; + if (idx >= total) return; + + int pos = idx / total_dim; + int d = idx % total_dim; + half val = fused[idx]; + + if (d < dim_a) { + out_a[pos * dim_a + d] = val; + } else if (d < dim_a + dim_b) { + out_b[pos * dim_b + (d - dim_a)] = val; + } else if (out_c) { + int dim_c = total_dim - dim_a - dim_b; + out_c[pos * dim_c + (d - dim_a - dim_b)] = val; + } +} + +extern "C" void launch_split_fused( + const void* fused, + void* out_a, + void* out_b, + void* out_c, // NULL for 2-way split + int seq_len, + int total_dim, + int dim_a, + int dim_b, + void* stream +) { + int total = seq_len * total_dim; + int blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; + split_fused_kernel<<>>( + (const half*)fused, + (half*)out_a, (half*)out_b, (half*)out_c, + seq_len, total_dim, dim_a, dim_b + ); +} + +// ============================================================================ +// FlashAttention 2 — fused tiled attention with online softmax +// +// Replaces the 3-kernel naive pipeline (scores → softmax → output) with a +// single kernel that tiles Q into Br-row blocks, iterates K/V in Bc-row +// blocks, and uses online softmax (running max + sum) so the full +// [seq_len × total_len] attention matrix is never materialized. +// +// Handles GQA natively: multiple Q heads share the same KV head. 
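+//
+// For reference, the per-row online-softmax update applied once per KV block
+// below is the standard FlashAttention-2 recurrence (m = running max,
+// l = running normalizer, O = unnormalized output accumulator, S_ij = scaled
+// scores of the current block):
+//
+//   m_new = max(m, max_j S_ij)
+//   alpha = exp(m - m_new)
+//   l     = l * alpha + sum_j exp(S_ij - m_new)
+//   O     = O * alpha + sum_j exp(S_ij - m_new) * V_j
+//   O_i (final) = O / l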
+// +// Grid: (ceil(seq_len / BR), num_heads) +// Block: 128 threads +// Smem: Q tile (16 KB) + KV tile (16 KB) + score tile (16 KB) = 48 KB +// ============================================================================ + +#define FA2_BR 64 +#define FA2_BC 64 +#define FA2_THREADS 128 +#define FA2_MAX_HALF_DIM 128 // head_dim / 2, supports up to head_dim=256 + +static __global__ void __launch_bounds__(FA2_THREADS, 4) +flash_attention_2_kernel( + const half* __restrict__ Q, // [seq_len, num_heads * head_dim] + const half* __restrict__ K, // [total_len, num_kv_heads * head_dim] + const half* __restrict__ V, // [total_len, num_kv_heads * head_dim] + half* __restrict__ O, // [seq_len, num_heads * head_dim] + int seq_len, + int total_len, + int num_heads, + int num_kv_heads, + int head_dim, + float scale, + int causal +) { + int q_tile_idx = blockIdx.x; + int head = blockIdx.y; + int tid = threadIdx.x; + + int br_start = q_tile_idx * FA2_BR; + int gqa_groups = num_heads / num_kv_heads; + int kv_head = head / gqa_groups; + int pos_offset = total_len - seq_len; + + // Thread-to-output mapping: 2 threads per Q row, each handles half of head_dim + int my_row = tid % FA2_BR; // 0..63 + int dim_half = tid / FA2_BR; // 0 or 1 + int half_dim = head_dim / 2; // 64 for hd=128 + int dim_start = dim_half * half_dim; + + // Global memory strides + int q_stride = num_heads * head_dim; + int kv_stride = num_kv_heads * head_dim; + + // Shared memory: Q tile | KV tile | Score tile + extern __shared__ char smem_raw[]; + half* smem_Q = (half*)smem_raw; + half* smem_KV = smem_Q + FA2_BR * head_dim; + float* smem_S = (float*)(smem_KV + FA2_BC * head_dim); + + // Online softmax state (per-thread, per-row) + float m_val = -1e30f; + float l_val = 0.0f; + + // Output accumulator — half_dim registers (64 for hd=128) + float O_acc[FA2_MAX_HALF_DIM]; + for (int d = 0; d < half_dim; d++) { + O_acc[d] = 0.0f; + } + + // ---- Load Q tile (stays resident for all KV iterations) ---- + { + int total_elems = FA2_BR * head_dim; + for (int idx = tid; idx < total_elems; idx += FA2_THREADS) { + int row = idx / head_dim; + int d = idx % head_dim; + int global_row = br_start + row; + if (global_row < seq_len) { + smem_Q[idx] = Q[(long long)global_row * q_stride + (long long)head * head_dim + d]; + } else { + smem_Q[idx] = __float2half(0.0f); + } + } + } + __syncthreads(); + + // ---- KV block loop ---- + int num_kv_blocks = (total_len + FA2_BC - 1) / FA2_BC; + + for (int kv_blk = 0; kv_blk < num_kv_blocks; kv_blk++) { + int kv_start = kv_blk * FA2_BC; + + // Causal early exit: all K positions in this block are beyond all Q positions + if (causal) { + int max_q_abs = br_start + FA2_BR - 1 + pos_offset; + if (kv_start > max_q_abs) break; + } + + // ---- Load K block ---- + { + int total_elems = FA2_BC * head_dim; + for (int idx = tid; idx < total_elems; idx += FA2_THREADS) { + int row = idx / head_dim; + int d = idx % head_dim; + int global_row = kv_start + row; + if (global_row < total_len) { + smem_KV[idx] = K[(long long)global_row * kv_stride + (long long)kv_head * head_dim + d]; + } else { + smem_KV[idx] = __float2half(0.0f); + } + } + } + __syncthreads(); + + // ---- S = Q @ K^T * scale (with causal mask) → smem_S ---- + { + int total_s = FA2_BR * FA2_BC; + for (int idx = tid; idx < total_s; idx += FA2_THREADS) { + int i = idx / FA2_BC; + int j = idx % FA2_BC; + + float dot = 0.0f; + for (int d = 0; d < head_dim; d++) { + dot += __half2float(smem_Q[i * head_dim + d]) + * __half2float(smem_KV[j * head_dim + d]); + } + dot *= scale; 
+ + // Causal mask: Q at absolute position q_abs can only attend to k_abs <= q_abs + if (causal) { + int q_abs = br_start + i + pos_offset; + int k_abs = kv_start + j; + if (k_abs > q_abs) dot = -1e30f; + } + // Mask out-of-bounds rows + if ((br_start + i) >= seq_len || (kv_start + j) >= total_len) { + dot = -1e30f; + } + + smem_S[idx] = dot; + } + } + __syncthreads(); + + // ---- Online softmax update for my_row ---- + // Both threads sharing this row (dim_half=0,1) compute identical m/l values + float block_max = -1e30f; + for (int j = 0; j < FA2_BC; j++) { + float s = smem_S[my_row * FA2_BC + j]; + if (s > block_max) block_max = s; + } + + float m_new = fmaxf(m_val, block_max); + float alpha = expf(m_val - m_new); + + float block_sum = 0.0f; + for (int j = 0; j < FA2_BC; j++) { + block_sum += expf(smem_S[my_row * FA2_BC + j] - m_new); + } + + l_val = l_val * alpha + block_sum; + for (int d = 0; d < half_dim; d++) { + O_acc[d] *= alpha; + } + m_val = m_new; + + __syncthreads(); + + // ---- Load V block (reuse smem_KV, K no longer needed) ---- + { + int total_elems = FA2_BC * head_dim; + for (int idx = tid; idx < total_elems; idx += FA2_THREADS) { + int row = idx / head_dim; + int d = idx % head_dim; + int global_row = kv_start + row; + if (global_row < total_len) { + smem_KV[idx] = V[(long long)global_row * kv_stride + (long long)kv_head * head_dim + d]; + } else { + smem_KV[idx] = __float2half(0.0f); + } + } + } + __syncthreads(); + + // ---- O_acc += P @ V (P recomputed from S and m_val) ---- + for (int j = 0; j < FA2_BC; j++) { + float p_val = expf(smem_S[my_row * FA2_BC + j] - m_val); + if (p_val > 1e-10f) { + for (int d = 0; d < half_dim; d++) { + O_acc[d] += p_val * __half2float(smem_KV[j * head_dim + dim_start + d]); + } + } + } + + __syncthreads(); + } + + // ---- Normalize and write output ---- + int global_row = br_start + my_row; + if (global_row < seq_len) { + float inv_l = (l_val > 0.0f) ? (1.0f / l_val) : 0.0f; + long long out_base = (long long)global_row * q_stride + (long long)head * head_dim; + for (int d = 0; d < half_dim; d++) { + O[out_base + dim_start + d] = __float2half(O_acc[d] * inv_l); + } + } +} + +extern "C" void launch_flash_attention( + const void* Q, + const void* K, + const void* V, + void* O, + int seq_len, + int total_len, + int num_heads, + int num_kv_heads, + int head_dim, + float scale, + int causal, + void* stream +) { + int q_tiles = (seq_len + FA2_BR - 1) / FA2_BR; + dim3 grid(q_tiles, num_heads); + dim3 block(FA2_THREADS); + + int smem_bytes = FA2_BR * head_dim * sizeof(half) // smem_Q + + FA2_BC * head_dim * sizeof(half) // smem_KV + + FA2_BR * FA2_BC * sizeof(float); // smem_S + + flash_attention_2_kernel<<>>( + (const half*)Q, (const half*)K, (const half*)V, (half*)O, + seq_len, total_len, num_heads, num_kv_heads, head_dim, + scale, causal + ); +} + +// ============================================================================ +// Fused decode attention (seq_len=1) +// +// Optimized for next-token generation. Each warp independently processes a +// stream of K/V positions — NO __syncthreads in the hot loop. +// +// Thread layout (dynamic, adapts to head_dim): +// Block size = head_dim threads (must be a multiple of 32) +// nwarps = head_dim / 32, dpt = head_dim / 32 (dims per thread) +// Each thread handles dpt dimensions: dims [lane*dpt .. lane*dpt+dpt-1] +// Warp w processes positions w, w+nwarps, w+2*nwarps, ... (interleaved) +// +// Examples: head_dim=128 → 4 warps, 4 dpt. head_dim=160 → 5 warps, 5 dpt. 
+// +// Each warp computes its own partial online-softmax result. After the loop, +// warps combine via shared memory using the log-sum-exp trick (1 sync). +// +// Memory: consecutive threads read consecutive K/V elements → coalesced. +// Supports head_dim up to 256 (DA_MAX_DPT=8). +// ============================================================================ + +// Max dims per thread — supports head_dim up to 512 (512/32 = 16). +// Gemma 4 global attention layers use head_dim=512. +#define DA_MAX_DPT 16 + +static __global__ void decode_fused_attention_kernel( + const half* __restrict__ Q, // [num_heads, head_dim] + const half* __restrict__ K, // [total_len, num_kv_heads, head_dim] + const half* __restrict__ V, // [total_len, num_kv_heads, head_dim] + half* __restrict__ O, // [num_heads, head_dim] + int num_heads, + int num_kv_heads, + int head_dim, + int total_len, + float scale +) { + const int head = blockIdx.x; + const int tid = threadIdx.x; + const int warp_id = tid / 32; + const int lane = tid & 31; + + // Compute layout from head_dim (block size = head_dim) + const int dpt = head_dim / 32; // dims per thread + const int nwarps = head_dim / 32; // warps per block + + const int gqa_groups = num_heads / num_kv_heads; + const int kv_head = head / gqa_groups; + const int kv_stride = num_kv_heads * head_dim; + + // Each thread loads its dpt Q dimensions + float q[DA_MAX_DPT]; + for (int i = 0; i < dpt; i++) { + q[i] = __half2float(Q[head * head_dim + lane * dpt + i]); + } + + // Per-warp online softmax state (registers) + float m = -1e30f; + float l = 0.0f; + float acc[DA_MAX_DPT]; + for (int i = 0; i < dpt; i++) acc[i] = 0.0f; + + // Hot loop: each warp processes its own position stream + for (int pos = warp_id; pos < total_len; pos += nwarps) { + long long base = (long long)pos * kv_stride + kv_head * head_dim + + lane * dpt; + + // Dot product: each thread multiplies its dpt dims, warp reduces + float dot = 0.0f; + for (int i = 0; i < dpt; i++) { + dot += q[i] * __half2float(K[base + i]); + } + // Warp-level reduction (shuffle only — no shared mem, no sync!) 
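+        // XOR butterfly over offsets 16, 8, 4, 2, 1: after log2(32) = 5 steps
+        // every lane holds the full 32-lane sum, so no broadcast is needed.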
+ #pragma unroll + for (int offset = 16; offset > 0; offset >>= 1) { + dot += __shfl_xor_sync(0xFFFFFFFF, dot, offset); + } + float score = dot * scale; // all lanes have the score + + // Online softmax + V accumulation + float new_m = fmaxf(m, score); + float correction = expf(m - new_m); + float p = expf(score - new_m); + + for (int i = 0; i < dpt; i++) { + acc[i] = acc[i] * correction + p * __half2float(V[base + i]); + } + l = l * correction + p; + m = new_m; + } + + // === Combine warps' partial results (log-sum-exp trick) === + // Dynamic shared memory layout: [nwarps floats m][nwarps floats l][nwarps*head_dim floats acc] + extern __shared__ char da_smem_raw[]; + float* smem_m = (float*)da_smem_raw; + float* smem_l = smem_m + nwarps; + float* smem_acc = smem_l + nwarps; + + if (lane == 0) { + smem_m[warp_id] = m; + smem_l[warp_id] = l; + } + for (int i = 0; i < dpt; i++) { + smem_acc[warp_id * head_dim + lane * dpt + i] = acc[i]; + } + __syncthreads(); + + // Find global max across warps (all threads compute, it's cheap) + float global_m = smem_m[0]; + for (int w = 1; w < nwarps; w++) { + global_m = fmaxf(global_m, smem_m[w]); + } + + // Combine each warp's contribution for this thread's dimensions + float final_l = 0.0f; + float final_acc[DA_MAX_DPT]; + for (int i = 0; i < dpt; i++) final_acc[i] = 0.0f; + + for (int w = 0; w < nwarps; w++) { + float corr = expf(smem_m[w] - global_m); + final_l += smem_l[w] * corr; + for (int i = 0; i < dpt; i++) { + final_acc[i] += smem_acc[w * head_dim + lane * dpt + i] * corr; + } + } + + // Write output (coalesced: consecutive threads write consecutive dims) + float inv_l = 1.0f / final_l; + for (int i = 0; i < dpt; i++) { + O[head * head_dim + lane * dpt + i] = + __float2half(final_acc[i] * inv_l); + } +} + +extern "C" void launch_decode_attention( + const void* Q, + const void* K, + const void* V, + void* O, + int num_heads, + int num_kv_heads, + int head_dim, + int total_len, + float scale, + void* stream +) { + cudaStream_t s = (cudaStream_t)stream; + int nwarps = head_dim / 32; + // Shared: nwarps floats (m) + nwarps floats (l) + nwarps*head_dim floats (acc) + int smem_bytes = (nwarps * 2 + nwarps * head_dim) * sizeof(float); + + decode_fused_attention_kernel<<>>( + (const half*)Q, (const half*)K, (const half*)V, (half*)O, + num_heads, num_kv_heads, head_dim, total_len, scale + ); +} + +// ============================================================================ +// P2P allreduce sum (2-GPU NVLink) +// ============================================================================ + +static __global__ void allreduce_kernel( + half* __restrict__ local_buf, + const half* __restrict__ peer_buf, + int size +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) return; + float val = __half2float(local_buf[idx]) + __half2float(peer_buf[idx]); + local_buf[idx] = __float2half(val); +} + +extern "C" void launch_allreduce_2gpu( + void* local_buf, + const void* peer_buf, + int num_elements, + void* stream +) { + int blocks = (num_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + allreduce_kernel<<>>( + (half*)local_buf, (const half*)peer_buf, num_elements + ); +} + +// ============================================================================ +// Zero buffer: output[i] = 0 for all elements +// Used to clear MoE accumulator before expert dispatch. 
+// ============================================================================ + +static __global__ void zero_kernel(half* __restrict__ output, int total_elements) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_elements) return; + output[idx] = __float2half(0.0f); +} + +extern "C" void launch_zero( + void* output, + int total_elements, + void* stream +) { + int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + zero_kernel<<>>( + (half*)output, total_elements + ); +} + +// ============================================================================ +// Fused scale-add: output[i] += scale * input[i] +// Used to accumulate weighted expert outputs in MoE dispatch. +// Computes in f32 internally for numerical stability. +// ============================================================================ + +static __global__ void scale_add_kernel( + half* __restrict__ output, + const half* __restrict__ input, + float scale, + int total_elements +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_elements) return; + float val = __half2float(output[idx]) + scale * __half2float(input[idx]); + output[idx] = __float2half(val); +} + +extern "C" void launch_scale_add( + void* output, + const void* input, + float scale, + int total_elements, + void* stream +) { + int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + scale_add_kernel<<>>( + (half*)output, (const half*)input, scale, total_elements + ); +} + +// ============================================================================ +// Gather rows: dst[i, :] = src[indices[i], :] for i in 0..num_rows +// Used to batch tokens assigned to the same expert into a contiguous buffer +// for efficient GEMM during MoE prefill. +// ============================================================================ + +static __global__ void gather_rows_kernel( + const half* __restrict__ src, + half* __restrict__ dst, + const int* __restrict__ indices, + int num_rows, + int row_dim +) { + int row = blockIdx.x; + if (row >= num_rows) return; + int src_row = indices[row]; + int tid = threadIdx.x; + for (int d = tid; d < row_dim; d += blockDim.x) { + dst[(long long)row * row_dim + d] = src[(long long)src_row * row_dim + d]; + } +} + +extern "C" void launch_gather_rows( + const void* src, + void* dst, + const int* indices, + int num_rows, + int row_dim, + void* stream +) { + if (num_rows == 0) return; + gather_rows_kernel<<>>( + (const half*)src, (half*)dst, indices, num_rows, row_dim + ); +} + +// ============================================================================ +// Weighted scatter-add: dst[indices[i], :] += weights[i] * src[i, :] +// Used to accumulate weighted expert outputs back to their token positions +// in mlp_out during MoE prefill. Each expert's scatter runs sequentially +// on the same stream, so no write races between experts. 
+// ============================================================================ + +static __global__ void weighted_scatter_add_kernel( + half* __restrict__ dst, + const half* __restrict__ src, + const int* __restrict__ indices, + const float* __restrict__ weights, + int num_rows, + int row_dim +) { + int row = blockIdx.x; + if (row >= num_rows) return; + int dst_row = indices[row]; + float w = weights[row]; + int tid = threadIdx.x; + for (int d = tid; d < row_dim; d += blockDim.x) { + float val = __half2float(dst[(long long)dst_row * row_dim + d]) + + w * __half2float(src[(long long)row * row_dim + d]); + dst[(long long)dst_row * row_dim + d] = __float2half(val); + } +} + +extern "C" void launch_weighted_scatter_add( + void* dst, + const void* src, + const int* indices, + const float* weights, + int num_rows, + int row_dim, + void* stream +) { + if (num_rows == 0) return; + weighted_scatter_add_kernel<<>>( + (half*)dst, (const half*)src, indices, weights, num_rows, row_dim + ); +} + +// ============================================================================ +// GeGLU: output = gelu_tanh(gate) * up (Gemma 4 activation) +// +// GELU with tanh approximation: +// gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) +// Combined with gated unit: output = gelu(gate) * up +// ============================================================================ + +static __global__ void geglu_kernel( + const half* __restrict__ gate, + const half* __restrict__ up, + half* __restrict__ output, + int total_elements +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_elements) return; + + float g = __half2float(gate[idx]); + float u = __half2float(up[idx]); + + // GELU tanh approximation: 0.5 * g * (1 + tanh(sqrt(2/pi) * (g + 0.044715 * g^3))) + const float SQRT_2_OVER_PI = 0.7978845608f; // sqrt(2.0 / pi) + float inner = SQRT_2_OVER_PI * (g + 0.044715f * g * g * g); + float gelu_g = 0.5f * g * (1.0f + tanhf(inner)); + + output[idx] = __float2half(gelu_g * u); +} + +extern "C" void launch_geglu( + const void* gate, + const void* up, + void* output, + int total_elements, + void* stream +) { + int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + geglu_kernel<<>>( + (const half*)gate, (const half*)up, (half*)output, total_elements + ); +} + +// ============================================================================ +// In-place scale: data[i] *= scale +// +// Used for: +// - Embedding scaling (sqrt(hidden_size)) after lookup +// - Per-layer scalar multiplication +// ============================================================================ + +static __global__ void scale_inplace_kernel( + half* __restrict__ data, + float scale, + int total_elements +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_elements) return; + data[idx] = __float2half(__half2float(data[idx]) * scale); +} + +extern "C" void launch_scale_inplace( + void* data, + float scale, + int total_elements, + void* stream +) { + int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + scale_inplace_kernel<<>>( + (half*)data, scale, total_elements + ); +} + +// ============================================================================ +// Logit soft-cap: logits[i] = cap * tanh(logits[i] / cap) +// +// Clamps logits to [-cap, +cap] range. Gemma 4 uses cap=30.0. 
+// ============================================================================ + +static __global__ void logit_softcap_kernel( + half* __restrict__ logits, + float cap, + int total_elements +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total_elements) return; + float val = __half2float(logits[idx]); + logits[idx] = __float2half(cap * tanhf(val / cap)); +} + +extern "C" void launch_logit_softcap( + void* logits, + float cap, + int total_elements, + void* stream +) { + int blocks = (total_elements + BLOCK_SIZE - 1) / BLOCK_SIZE; + logit_softcap_kernel<<>>( + (half*)logits, cap, total_elements + ); +} + +// ============================================================================ +// Scale-free RMSNorm (no learnable weight): +// output = input * rsqrt(mean(input^2) + eps) +// +// Used for Gemma 4 V-norm (value states) and router input normalization. +// Same reduction as rmsnorm_kernel, but without the weight multiply. +// ============================================================================ + +static __global__ void rmsnorm_noscale_kernel( + const half* __restrict__ input, + half* __restrict__ output, + int dim, + float eps +) { + int row = blockIdx.x; + int tid = threadIdx.x; + int stride = blockDim.x; + + const half* x = input + (long long)row * dim; + half* out = output + (long long)row * dim; + + extern __shared__ float sdata[]; + + float local_sum = 0.0f; + for (int i = tid; i < dim; i += stride) { + float val = __half2float(x[i]); + local_sum += val * val; + } + sdata[tid] = local_sum; + __syncthreads(); + + for (int s = stride / 2; s > 0; s >>= 1) { + if (tid < s) { + sdata[tid] += sdata[tid + s]; + } + __syncthreads(); + } + + float rms = rsqrtf(sdata[0] / (float)dim + eps); + + for (int i = tid; i < dim; i += stride) { + out[i] = __float2half(__half2float(x[i]) * rms); + } +} + +extern "C" void launch_rmsnorm_noscale( + const void* input, + void* output, + int batch_size, + int dim, + float eps, + void* stream +) { + int threads = (dim < BLOCK_SIZE) ? dim : BLOCK_SIZE; + int smem = threads * sizeof(float); + rmsnorm_noscale_kernel<<>>( + (const half*)input, (half*)output, dim, eps + ); +} + +// ============================================================================ +// Partial RoPE: Apply rotary embeddings to only the first rotary_dim +// dimensions of each head, leaving the remaining dimensions unchanged. +// +// Gemma 4 global attention layers use partial_rotary_factor=0.25, meaning +// only 128 out of 512 head dimensions get rotary encoding. 
+// +// Uses the same rotary-half convention as launch_rope: +// pairs are (d[i], d[i + half_rotary]) for i in 0..half_rotary +// ============================================================================ + +static __global__ void rope_partial_kernel( + half* __restrict__ qk, // [seq_len, num_heads, head_dim] + int num_heads, + int head_dim, + int rotary_dim, // first rotary_dim dims get RoPE + int pos_offset, + float theta_base +) { + int seq_idx = blockIdx.x; + int head = blockIdx.y; + int tid = threadIdx.x; + int half_rotary = rotary_dim / 2; + + if (tid >= half_rotary || head >= num_heads) return; + + int pos = seq_idx + pos_offset; + float freq = 1.0f / powf(theta_base, (float)(2 * tid) / (float)rotary_dim); + float angle = (float)pos * freq; + float cos_val = cosf(angle); + float sin_val = sinf(angle); + + long long base_idx = ((long long)seq_idx * num_heads + head) * head_dim; + float v0 = __half2float(qk[base_idx + tid]); + float v1 = __half2float(qk[base_idx + tid + half_rotary]); + qk[base_idx + tid] = __float2half(v0 * cos_val - v1 * sin_val); + qk[base_idx + tid + half_rotary] = __float2half(v0 * sin_val + v1 * cos_val); + // Dimensions [rotary_dim, head_dim) are untouched +} + +extern "C" void launch_rope_partial( + void* q, + void* k, + int seq_len, + int num_heads, + int num_kv_heads, + int head_dim, + int rotary_dim, + int pos_offset, + float theta_base, + void* stream +) { + int threads = rotary_dim / 2; + if (threads > BLOCK_SIZE) threads = BLOCK_SIZE; + dim3 grid_q(seq_len, num_heads); + dim3 grid_k(seq_len, num_kv_heads); + + rope_partial_kernel<<>>( + (half*)q, num_heads, head_dim, rotary_dim, pos_offset, theta_base + ); + rope_partial_kernel<<>>( + (half*)k, num_kv_heads, head_dim, rotary_dim, pos_offset, theta_base + ); +} diff --git a/crates/weaver-spu/src/encoder/jina_v4.rs b/crates/weaver-spu/src/encoder/jina_v4.rs index dfcc053b..39d0c1c7 100644 --- a/crates/weaver-spu/src/encoder/jina_v4.rs +++ b/crates/weaver-spu/src/encoder/jina_v4.rs @@ -50,6 +50,23 @@ use super::qwen25vl_loraed::{Config as Qwen2Config, LoraQwen2Model}; /// fail closed rather than silently mis-dispatch. const ADAPTER_TASKS: &[&str] = &["retrieval", "text-matching", "code"]; +/// Jina V4's *trained* context window. Distinct from the snapshot's +/// `config.json::max_position_embeddings` (= 128_000), which reflects +/// the underlying Qwen2.5-VL architecture's RoPE extension. Jina V4 +/// was trained on 32K-token sequences; using sequences longer than +/// this is OOD and produces low-quality embeddings — the model card +/// and the upstream Python service both report this as the +/// operator-facing max. +/// +/// We cap `max_seq_len` to this constant rather than the snapshot +/// architecture max so that: +/// - `EmbedderInfo::max_seq_length` matches the existing Python +/// service's report (cohort-pin guard stays consistent across +/// backends; the migration doesn't require a pin reset); +/// - `encode_text` rejects oversized inputs at the API edge with a +/// clear error rather than silently producing degraded vectors. +pub const JINA_V4_TRAINED_CONTEXT: usize = 32_768; + /// Jina V4's 4-string user-facing task surface. 
/// /// This is what the embedder API takes; it's translated @@ -244,7 +261,9 @@ impl JinaV4Embedder { .with_context(|| format!("read {}", config_path.display()))?; let raw: JinaV4ConfigRaw = serde_json::from_str(&config_json).context("parse Jina V4 config.json")?; - let max_seq_len = raw.max_position_embeddings; + // Cap to Jina V4's trained context, not the architecture max. + // See `JINA_V4_TRAINED_CONTEXT` for the rationale. + let max_seq_len = JINA_V4_TRAINED_CONTEXT.min(raw.max_position_embeddings); let cfg = raw.into_qwen2_config()?; // Read adapter_config.json so the LoRA hyperparameters @@ -368,8 +387,8 @@ impl JinaV4Embedder { // user-visible semantics. if ids.len() > self.max_seq_len { bail!( - "input is {} tokens after prefix; exceeds Jina V4's max_position_embeddings of {}. \ - Caller must split or truncate before encoding.", + "input is {} tokens after prefix; exceeds Jina V4's trained context \ + (max_seq_length = {}). Caller must split or truncate before encoding.", ids.len(), self.max_seq_len ); @@ -443,9 +462,18 @@ impl JinaV4Embedder { self.model.dtype() } - /// Architectural sequence-length ceiling read from the snapshot's - /// `config.json` (`max_position_embeddings`). Used by the - /// `EmbedderClient` to populate `EmbedderInfo::max_seq_length`. + /// Effective sequence-length ceiling for this embedder. + /// + /// Returns `min(JINA_V4_TRAINED_CONTEXT, raw.max_position_embeddings)` + /// — i.e., the cap from `from_snapshot()`, not the raw architectural + /// ceiling from `config.json::max_position_embeddings`. On the + /// canonical 853c867 snapshot the architecture max is 128_000 but + /// Jina V4 was trained on 32_768; the cap returns the trained value + /// so this matches what the Python embedder service reports and + /// what `encode_text()` will accept without bailing. + /// + /// Used by `EmbedderClient` to populate + /// `EmbedderInfo::max_seq_length`. pub fn max_seq_len(&self) -> usize { self.max_seq_len } diff --git a/docs/infrastructure/embedder-rust-cutover-evidence/smoke-2026-05-08.md b/docs/infrastructure/embedder-rust-cutover-evidence/smoke-2026-05-08.md new file mode 100644 index 00000000..b6fd76b7 --- /dev/null +++ b/docs/infrastructure/embedder-rust-cutover-evidence/smoke-2026-05-08.md @@ -0,0 +1,108 @@ +# In-process Rust embedder — end-to-end smoke test (2026-05-08) + +Phase 1 task #117. First end-to-end verification that the in-process +`EmbedderClient` constructed by `weaver serve` against a real Jina V4 +snapshot boots cleanly through the cohort-pin probe and the daemon +listens. + +## Setup + +- **Hardware**: GPU 0 (RTX A6000, 49 GB) — GPU 2 off-limits this + session per concurrent work; GPU 0/1 reserved for our smoke runs. +- **Snapshot**: + `/opt/weaver/huggingface/hub/models--jinaai--jina-embeddings-v4/snapshots/853c867b65b749f3c3c72a06868140d842e04f06/` + (the canonical revision the existing pin records). +- **Existing pin**: `/opt/weaver/state/embedder.pin.json` — written + 2026-04-20 by the Python embedder service, model `jinaai/jina-embeddings-v4`, + dim=2048, max_seq=32768, rev=853c867…. +- **Daemon binary**: `target/release/weaver` (241 MB), + `cargo build -p weaver-interface --features inference,embedder-rust --release`. +- **Smoke config**: `/tmp/weaver-smoke-rust.toml`. `[embedder].backend = "rust"`, + `snapshot = "/opt/weaver/.../snapshots/853c867.../"`, GPU 0 locked. + Socket on `/tmp/` to avoid permissions clash with the (currently + inactive) systemd unit's `/run/weaver/` paths. 
+ +## Pre-flight fixes that this run required + +1. **Restore `kernels/transformer.cu`** (1266 lines). PR-0.5.E (#278) + deleted the file when consolidating `weaver-inference` into + `weaver-spu` but never ported it across. Without it, the cudarc + decoder path's FFI declarations (`launch_allreduce_2gpu`, etc.) are + unresolved at link time. Restored from `git show 79bb649^:crates/weaver-inference/kernels/transformer.cu` + into `crates/weaver-spu/kernels/transformer.cu`. + +2. **Revert `build.rs` absent-file fallback** added in PR #296. The + fallback skipped the cuda compile when the kernel was missing; with + the file restored, the fallback is dead code and the warning is + misleading. + +3. **Cap `max_seq_len` to Jina V4's trained context** in + `encoder/jina_v4.rs`. The snapshot's `config.json::max_position_embeddings` + reports 128000 (the Qwen2.5-VL architecture's RoPE-extended max), + but Jina V4 was trained on 32K — going beyond is OOD and produces + degraded embeddings. The Python service correctly reports 32768. + First smoke run failed on this exact identity drift: + `pin=32768 live=128000`. Added `JINA_V4_TRAINED_CONTEXT = 32_768` + constant; `max_seq_len` derivation is now + `JINA_V4_TRAINED_CONTEXT.min(raw.max_position_embeddings)`. + +## Run + +```text +$ timeout 60 ./target/release/weaver serve --config /tmp/weaver-smoke-rust.toml + +WARN [embedder].gpu is deprecated post-B′.3b; encoder placement is now per-agent + via spu.encoder.gpu. ... gpu=0 +Constructing in-process Rust embedder from + /opt/weaver/huggingface/hub/models--jinaai--jina-embeddings-v4/snapshots/853c867.../ + on cuda:0 bf16... +Embedder ready (in-process) in 1.2s: jinaai/jina-embeddings-v4 + (dim=2048, max_seq=32768, rev=853c867b65b749f3c3c72a06868140d842e04f06) +GPU policies: GPU 0=locked +Inference server starting (socket=/tmp/weaver-smoke-inference.sock, tcp=127.0.0.1:18080) +Models loaded on demand via 'weaver model load' or agent pre-flight. +Terminated ← SIGTERM at 60s timeout (exit 143) +``` + +## What this confirms + +| Step | Status | +|------|:------:| +| Config parse + `MultiModelConfig::validate()` accepts `backend = "rust"` + `snapshot` | ✓ | +| `[embedder].gpu` deprecation warning logs | ✓ | +| Backend probe runs **before** `build_server_state()` (PR #296 review fix) | ✓ | +| `EmbedderClient::from_snapshot(...)` constructs in 1.2 s on cuda:0 bf16 | ✓ | +| `info()` returns model_name + dim 2048 + max_seq 32768 + revision basename | ✓ | +| Cohort-pin probe matches existing `/opt/weaver/state/embedder.pin.json` | ✓ | +| `build_server_state()` succeeds (empty model registry, no load) | ✓ | +| Daemon listens on Unix socket + TCP listener | ✓ | +| Daemon survives until SIGTERM | ✓ | + +## What this does **not** confirm + +- Actual embed requests against the daemon — the smoke config has no + `[[models]]` and the daemon's only embedder use today is the boot-time + pin probe. End-to-end embed exercises happen via task #118 + (agent-side runtime cutover) where consumers actually hold an + `Arc`. +- Dual-backend fallback semantics — only the `backend = "rust"` path + was exercised tonight. The `backend = "python"` path remains + validated by previous production runs but not by this smoke. +- Long-running stability — only a 60s bounded run. No load, no + long-context inputs, no memory pressure. + +## Files + +- `/tmp/weaver-smoke-rust.toml` — smoke config (committed-tree + shouldn't carry it; lives in `/tmp` intentionally). +- `/tmp/weaver-smoke-rust-v2.log` — captured output, reproduced inline above. 
+
+## Follow-ups
+
+- #118 agent-side runtime cutover — replace
+  `EmbeddingClient::connect_default()` in `crates/weaver-demo/src/herobench/benchmark.rs`
+  with the new in-process `EmbedderClient`.
+- Operator-facing migration note: when systemd `weaver-infer.service`
+  flips to `--features embedder-rust` and `backend = "rust"`, the
+  existing pin file's `max_seq_length=32768` will keep matching
+  thanks to the `JINA_V4_TRAINED_CONTEXT` cap. No pin reset required
+  (see the probe sketch below).
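+
+For reference, the identity check that the cap keeps green amounts to a
+comparison like the following. This is a minimal sketch — the type and field
+names are illustrative, not the actual `weaver` definitions:
+
+```rust
+/// Illustrative subset of /opt/weaver/state/embedder.pin.json and of the
+/// live `EmbedderInfo` report (names are assumptions for this sketch).
+struct EmbedderPin {
+    model_name: String,
+    dim: usize,
+    max_seq_length: usize,
+    revision: String,
+}
+
+struct LiveEmbedderInfo {
+    model_name: String,
+    dim: usize,
+    max_seq_length: usize,
+    revision: String,
+}
+
+/// Fail closed on any drift between the recorded cohort pin and the live backend.
+fn cohort_pin_probe(pin: &EmbedderPin, live: &LiveEmbedderInfo) -> Result<(), String> {
+    if pin.max_seq_length != live.max_seq_length {
+        // This is the check that failed before the JINA_V4_TRAINED_CONTEXT cap:
+        // pin=32768 live=128000.
+        return Err(format!(
+            "embedder identity drift: pin={} live={}",
+            pin.max_seq_length, live.max_seq_length
+        ));
+    }
+    if pin.model_name != live.model_name || pin.dim != live.dim || pin.revision != live.revision {
+        return Err("embedder identity drift".into());
+    }
+    Ok(())
+}
+```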