diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 39b7eb218e69..ecdf18fa1a06 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2597,6 +2597,16 @@ struct server_context_impl { slot->prompt.tokens.clear(); slot->prompt.tokens.insert(tokens); + // A restored slot has no context checkpoint, so the next + // request with cache_prompt finds no reuse anchor and + // reprocesses the entire restored prefix. Create a + // checkpoint spanning the restored span so the restored + // KV is actually reused. + if (params_base.n_ctx_checkpoints > 0) { + create_checkpoint(*slot, (int64_t) 0, 0, + (llama_pos) (token_count > 0 ? token_count - 1 : 0)); + } + const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0;