From ef43d819bce3d0df24051221185b17f3790fe677 Mon Sep 17 00:00:00 2001
From: julio <julio@luxmarketing.agency>
Date: Tue, 23 Jun 2026 16:32:17 -0400
Subject: [PATCH] server : create context checkpoint on slot restore

A restored slot had no context checkpoint, so the next request with
cache_prompt found no reuse anchor and reprocessed the entire restored
prefix (cache_n=0), defeating the purpose of /slots restore. When
context checkpoints are enabled (n_ctx_checkpoints > 0), create a
checkpoint covering the restored token range so the restored KV is
reused.

Tested on a transformer model (gemma-4-12B, 15924-token prefix): a
restore followed by an identical prompt went from prompt_n=15924/15924
(full reprocess) to prompt_n=1/15924 (reused).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tools/server/server-context.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 39b7eb218e69..ecdf18fa1a06 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2597,6 +2597,16 @@ struct server_context_impl {
                     slot->prompt.tokens.clear();
                     slot->prompt.tokens.insert(tokens);
 
+                    // A restored slot has no context checkpoint, so the next
+                    // request with cache_prompt finds no reuse anchor and
+                    // reprocesses the entire restored prefix. Create a
+                    // checkpoint spanning the restored span so the restored
+                    // KV is actually reused.
+                    if (params_base.n_ctx_checkpoints > 0) {
+                        create_checkpoint(*slot, (int64_t) 0, 0,
+                                          (llama_pos) (token_count > 0 ? token_count - 1 : 0));
+                    }
+
                     const int64_t t_end = ggml_time_us();
                     const double t_restore_ms = (t_end - t_start) / 1000.0;