From 381c3d299c7e89112a2436b7b6377f4c8be15cc9 Mon Sep 17 00:00:00 2001
From: Saba Fallah <sabafallah@gmail.com>
Date: Wed, 24 Jun 2026 07:55:22 +0200
Subject: [PATCH 1/4] mtmd: model: unlimited-ocr: converter + parity test

---
 conversion/__init__.py                |  3 ++
 conversion/deepseek.py                | 12 +++++-
 tools/mtmd/tests/test-deepseek-ocr.py | 54 ++++++++++++++++++++++++---
 3 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/conversion/__init__.py b/conversion/__init__.py
index 00192cf33a8c..2b6b7a4338fc 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -46,6 +46,7 @@
     "DbrxForCausalLM": "dbrx",
     "DeciLMForCausalLM": "deci",
     "DeepseekForCausalLM": "deepseek",
+    "DeepseekOCRForCausalLM": "deepseek",
     "DeepseekV2ForCausalLM": "deepseek",
     "DeepseekV3ForCausalLM": "deepseek",
     "DeepseekV32ForCausalLM": "deepseek",
@@ -231,6 +232,7 @@
     "UMT5ForConditionalGeneration": "t5",
     "UMT5Model": "t5",
     "UltravoxModel": "ultravox",
+    "UnlimitedOCRForCausalLM": "deepseek",
     "VLlama3ForCausalLM": "llama",
     "VoxtralForConditionalGeneration": "llama",
     "WavTokenizerDec": "wavtokenizer",
@@ -296,6 +298,7 @@
     "StepVLForConditionalGeneration": "step3",
     "Step3p7ForConditionalGeneration": "step3",
     "UltravoxModel": "ultravox",
+    "UnlimitedOCRForCausalLM": "deepseek",
     "VoxtralForConditionalGeneration": "ultravox",
     "YoutuVLForConditionalGeneration": "youtuvl",
 }
diff --git a/conversion/deepseek.py b/conversion/deepseek.py
index 72520cc9f6a5..4c93fb66df64 100644
--- a/conversion/deepseek.py
+++ b/conversion/deepseek.py
@@ -14,7 +14,7 @@
 from .qwen import QwenModel
 
 
-@ModelBase.register("DeepseekOCRForCausalLM")
+@ModelBase.register("DeepseekOCRForCausalLM", "UnlimitedOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -205,6 +205,8 @@ def prepare_tensors(self):
 @ModelBase.register(
     "DeepseekV2ForCausalLM",
     "DeepseekV3ForCausalLM",
+    "DeepseekOCRForCausalLM",
+    "UnlimitedOCRForCausalLM",
     "KimiVLForConditionalGeneration",
     "KimiK25ForConditionalGeneration",
     "YoutuForCausalLM",
@@ -224,7 +226,7 @@ def __init__(self, *args, **kwargs):
         self.origin_hf_arch = hparams.get('architectures', [None])[0]
 
         # special handling for Deepseek OCR
-        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
+        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM", "UnlimitedOCRForCausalLM"):
             self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
             self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
             self.gguf_writer.add_architecture()
@@ -350,6 +352,12 @@ def set_gguf_parameters(self):
 
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
+        # Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA)
+        if is_ocr:
+            sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window")
+            if sliding_window:
+                self.gguf_writer.add_sliding_window(sliding_window)
+
         if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
             # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
             # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py
index 5f5fef765a62..ec0b4523be9a 100644
--- a/tools/mtmd/tests/test-deepseek-ocr.py
+++ b/tools/mtmd/tests/test-deepseek-ocr.py
@@ -9,6 +9,7 @@
 
 import argparse
 import logging
+import re
 import subprocess
 import sys
 import unicodedata
@@ -28,6 +29,12 @@ class ModelSpec:
     mmproj_arg: str
     model_default: str
     mmproj_default: str
+    prompt: str = "Free OCR. "
+    n_predict: int = 512
+    n_ctx: int | None = None
+    # Unlimited-OCR's "document parsing" prompt emits <|det|> grounding markup that
+    # the HF reference strips in result.md; drop it before scoring to match.
+    strip_grounding: bool = False
 
 
 @dataclass
@@ -63,6 +70,20 @@ def chrf_min(self) -> float:
         model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf",
         mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf",
     ),
+    "unlimited": ModelSpec(
+        key="unlimited", label="Unlimited-OCR",
+        model_arg="--llama-model-unlimited", mmproj_arg="--mmproj-unlimited",
+        model_default="gguf_models/baidu/unlimited-ocr-bf16.gguf",
+        mmproj_default="gguf_models/baidu/mmproj-unlimited-ocr-bf16.gguf",
+        # "Free OCR." immediately emits EOS on this checkpoint; the HF reference
+        # (demo/unlimited_ocr_scores.py) uses "document parsing.", which grounds.
+        prompt="document parsing.",
+        # Grounding emits ~3x the tokens of plain OCR, so it needs a larger budget
+        # and context to reach the article body the ground truth covers.
+        n_predict=4096,
+        n_ctx=16384,
+        strip_grounding=True,
+    ),
 }
 
 CASES = [
@@ -82,9 +103,26 @@ def chrf_min(self) -> float:
         # is one pixel off and lands at ~0.69 instead.
         hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0,
     ),
+    TestCase(
+        model_key="unlimited", label="single-view scan",
+        image="tools/mtmd/test-1.jpeg",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        # HF reference: Unlimited-OCR scoring (gundam, bf16) on this image/ground-truth.
+        # Decoder runs full MHA, not R-SWA; the band absorbs that gap + bf16 variance.
+        hf_cer=0.1869, hf_chrf=75.23, cer_tol=0.06, chrf_tol=6.0,
+    ),
 ]
 
 
+GROUNDING_TAG_RE = re.compile(r"<\|(ref|det)\|>.*?<\|/\1\|>", re.DOTALL)
+
+
+def strip_grounding(text: str) -> str:
+    """Drop <|ref|>..<|/ref|> / <|det|>..<|/det|> grounding markup, matching the
+    cleaned result.md the HF reference scores against."""
+    return GROUNDING_TAG_RE.sub("", text)
+
+
 def arg_dest(flag: str) -> str:
     return flag.lstrip("-").replace("-", "_")
 
@@ -129,19 +167,19 @@ def compute_chrf(expected: str, ocr_out: str) -> float:
     return CHRF().sentence_score(ocr_out, [expected]).score
 
 
-def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
+def run_mtmd_cli(spec: "ModelSpec", model_path, mmproj_path, image_path, bin_path) -> str:
     """Run mtmd-cli on the image and return its output."""
     cmd = [
         str(bin_path),
         "-m", str(model_path),
         "--mmproj", str(mmproj_path),
         "--image", str(image_path),
-        "-p", "Free OCR. ",
+        "-p", spec.prompt,
         "--chat-template", "deepseek-ocr",
         "--temp", "0",
         "--flash-attn", "off",  # match the HF "eager" attention reference
         "--no-warmup",
-        "-n", "512",  # cap loops on hard images (KV would otherwise fill)
+        "-n", str(spec.n_predict),  # cap loops on hard images (KV would otherwise fill)
         # HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY.
         # Default DRY breakers include "\n", so they are cleared below.
         "--dry-multiplier", "0.8",
@@ -150,6 +188,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
         "--dry-penalty-last-n", "-1",
         "--dry-sequence-breaker", "none",
     ]
+    if spec.n_ctx is not None:
+        cmd += ["-c", str(spec.n_ctx)]
     logger.debug(f"  command: {' '.join(cmd)}")
 
     try:
@@ -164,6 +204,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
         raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")
 
     output = result.stdout.decode("utf-8", errors="replace").strip()
+    if spec.strip_grounding:
+        output = strip_grounding(output)
     if not output:
         raise RuntimeError("llama-mtmd-cli produced no output on stdout")
     logger.info(f"  output: {len(output)} chars")
@@ -193,7 +235,7 @@ def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool:
 
     logger.info("")
     logger.info("=" * 60)
-    logger.info("Free OCR evaluation:")
+    logger.info("OCR evaluation:")
     logger.info("=" * 60)
     logger.info(f"  CER               {cer:>7.4f}    (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f}  -> {verdict(cer_pass)})")
     logger.info(f"  chrF (0-100)      {chrf:>7.2f}    (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f}  -> {verdict(chrf_pass)})")
@@ -269,9 +311,9 @@ def main() -> int:
         expected = read_expected_text(ground_truth)
         logger.info(f"  Image: {case.image}")
         logger.info(f"  Expected text: {len(expected)} chars")
-        logger.info("  Running llama.cpp 'Free OCR'")
+        logger.info(f"  Running llama.cpp prompt {model_spec.prompt!r}")
         try:
-            ocr_out = run_mtmd_cli(model, mmproj, image, binary)
+            ocr_out = run_mtmd_cli(model_spec, model, mmproj, image, binary)
         except RuntimeError as e:
             logger.error(f"  Error: {e}")
             results[title] = False

From 823dfaac262e4663c59fcb044effbf50fccf6e72 Mon Sep 17 00:00:00 2001
From: Saba Fallah <sabafallah@gmail.com>
Date: Wed, 24 Jun 2026 08:23:33 +0200
Subject: [PATCH 2/4] deepseek2-ocr: R-SWA reference sliding window attention

---
 conversion/deepseek.py                |  2 +-
 src/llama-graph.cpp                   |  5 +++-
 src/llama-hparams.h                   | 14 ++++++++++-
 src/llama-kv-cache.cpp                | 35 ++++++++++++++++++++++++++-
 src/llama-kv-cache.h                  |  4 +++
 src/llama-model.cpp                   | 22 +++++++++++++++++
 src/models/deepseek2ocr.cpp           |  6 +++++
 tools/mtmd/tests/test-deepseek-ocr.py |  8 +++---
 8 files changed, 89 insertions(+), 7 deletions(-)

diff --git a/conversion/deepseek.py b/conversion/deepseek.py
index 4c93fb66df64..28d280ba7ddb 100644
--- a/conversion/deepseek.py
+++ b/conversion/deepseek.py
@@ -352,7 +352,7 @@ def set_gguf_parameters(self):
 
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
-        # Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA)
+        # Unlimited-OCR R-SWA sliding window; the deepseek2-ocr decoder reads it
         if is_ocr:
             sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window")
             if sliding_window:
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 68c9e606c3e3..2a8fce032242 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -395,6 +395,7 @@ static void print_mask(const T * data, int64_t n_tokens, int64_t n_kv, int64_t n
         case LLAMA_SWA_TYPE_STANDARD:  swa_type_str = "LLAMA_SWA_TYPE_STANDARD"; break;
         case LLAMA_SWA_TYPE_CHUNKED:   swa_type_str = "LLAMA_SWA_TYPE_CHUNKED"; break;
         case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break;
+        case LLAMA_SWA_TYPE_REFERENCE: swa_type_str = "LLAMA_SWA_TYPE_REFERENCE"; break;
     };
 
     LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swa_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str);
@@ -2285,7 +2286,9 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
     auto inp = std::make_unique<llm_graph_input_attn_kv>(hparams, cparams, mctx_cur);
 
     {
-        GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA");
+        // REFERENCE masks within this single cache; other SWA types need iswa
+        GGML_ASSERT((hparams.swa_type == LLAMA_SWA_TYPE_NONE ||
+                     hparams.swa_type == LLAMA_SWA_TYPE_REFERENCE) && "Use llama_kv_cache_iswa for SWA");
 
         inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
         inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 2eadeb214811..63474de780ec 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -21,6 +21,7 @@ enum llama_swa_type {
     LLAMA_SWA_TYPE_STANDARD  = 1,
     LLAMA_SWA_TYPE_CHUNKED   = 2,
     LLAMA_SWA_TYPE_SYMMETRIC = 3,
+    LLAMA_SWA_TYPE_REFERENCE = 4, // R-SWA: always-visible prefix + window over the rest
 };
 
 // forward declaration; full definition in llama-graph.h
@@ -357,7 +358,8 @@ struct llama_hparams {
     // note: inlined on purpose for performance reasons
     // TODO: think of a better place for this function
     // TODO: pack the SWA params in a struct?
-    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) {
+    // n_ref = R-SWA prefix length L_m (always-visible positions); < 0 = unlatched, full causal
+    static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1, llama_pos n_ref = -1) {
         assert(p0 >= 0 && p1 >= 0);
 
         switch (swa_type) {
@@ -388,6 +390,16 @@ struct llama_hparams {
                         return true;
                     }
                 } break;
+            case LLAMA_SWA_TYPE_REFERENCE:
+                {
+                    // visible iff in the prefix (p0 < n_ref) or within the window (p1 - p0 < n_swa)
+                    const bool windowed  = p1 - p0 >= (int32_t) n_swa;
+                    const bool in_prefix = n_ref < 0 || p0 < n_ref;
+
+                    if (windowed && !in_prefix) {
+                        return true;
+                    }
+                } break;
         }
 
         return false;
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 2802103bdd82..a3130f251342 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -100,6 +100,8 @@ llama_kv_cache::llama_kv_cache(
     v_cells_impl(other ? other->v_cells_impl : std::make_shared<llama_kv_cells_vec>()),
     v_cells(*v_cells_impl) {
 
+    n_ref.fill(-1);
+
     // shared cells view the source cache's K/V tensors, so the cell count
     // follows the source allocation: a fitted target can be smaller than the
     // draft default and oversized views would overflow the source tensors
@@ -377,6 +379,8 @@ llama_kv_cache::llama_kv_cache(
 }
 
 void llama_kv_cache::clear(bool data) {
+    n_ref.fill(-1);
+
     for (uint32_t s = 0; s < n_stream; ++s) {
         v_cells[s].reset();
         v_heads[s] = 0;
@@ -405,6 +409,15 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
         p1 = std::numeric_limits<llama_pos>::max();
     }
 
+    // dropping from pos 0 invalidates the latched prefix
+    if (p0 == 0) {
+        if (seq_id >= 0) {
+            n_ref[seq_id] = -1;
+        } else {
+            n_ref.fill(-1);
+        }
+    }
+
     if (seq_id >= 0) {
         auto & cells = v_cells[seq_to_stream[seq_id]];
         auto & head  = v_heads[seq_to_stream[seq_id]];
@@ -1109,6 +1122,22 @@ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch &
         return;
     }
 
+    // latch L_m at the prefill->decode boundary (first single-token append to a populated
+    // seq); until then the mask is full causal. assumes single-token decode (mtmd-cli/server).
+    if (swa_type == LLAMA_SWA_TYPE_REFERENCE) {
+        uint32_t n_tok_seq[LLAMA_MAX_SEQ] = { 0 };
+        for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+            n_tok_seq[ubatch.seq_id[i][0]]++;
+        }
+        for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+            const llama_seq_id seq_id = ubatch.seq_id[i][0];
+            if (n_ref[seq_id] < 0 && n_tok_seq[seq_id] == 1 &&
+                v_cells[seq_to_stream[seq_id]].seq_pos_max(seq_id) >= 0) {
+                n_ref[seq_id] = ubatch.pos[i];
+            }
+        }
+    }
+
     // keep track of the max sequence position that we would overwrite with this ubatch
     // for non-SWA cache, this would be always empty
     llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ];
@@ -1519,6 +1548,9 @@ struct args_set_input_kq_mask {
     uint32_t       n_swa;
     llama_swa_type swa_type;
 
+    // per-seq R-SWA prefix length L_m (-1 = unlatched), indexed by seq_id
+    const llama_pos * n_ref;
+
     int64_t n_kv;
     int64_t n_stream;
     int64_t n_tps;
@@ -1654,7 +1686,7 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data
 
                 // apply SWA if any
                 if (swa) {
-                    if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) {
+                    if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1, args.n_ref[seq_id])) {
                         goto skip;
                     }
                 }
@@ -1734,6 +1766,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
         /*.seq_to_stream    =*/ seq_to_stream,
         /*.n_swa            =*/ n_swa,
         /*.swa_type         =*/ swa_type,
+        /*.n_ref            =*/ n_ref.data(),
         /*.n_kv             =*/ n_kv,
         /*.n_stream         =*/ n_stream,
         /*.n_tps            =*/ n_tps,
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 3d68f98c1424..89edc0adb1a8 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -259,6 +259,10 @@ class llama_kv_cache : public llama_memory_i {
     // this is the SWA type of the cache - not to be confused with the model SWA type
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
 
+    // R-SWA per-seq prefix length L_m (-1 = unlatched -> full causal mask); latched at the
+    // prefill->decode boundary in apply_ubatch, read by set_input_kq_mask.
+    std::array<llama_pos, LLAMA_MAX_SEQ> n_ref;
+
     // ggml contexts for the KV cache along with the allocated backend buffers:
     std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> ctxs_bufs;
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d041a9ce3e27..9cc5d1aa27e5 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2023,6 +2023,28 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
             {
                 res = nullptr;
             } break;
+        case LLM_ARCH_DEEPSEEK2OCR:
+            {
+                // R-SWA runs on one full cache - the REFERENCE mask keeps the
+                // prefix visible, so no eviction and no iswa.
+                res = new llama_kv_cache(
+                        *this,
+                        hparams,
+                        params.type_k,
+                        params.type_v,
+                        !cparams.flash_attn,
+                        cparams.offload_kqv,
+                        cparams.kv_unified,
+                        cparams.n_ctx_seq,
+                        cparams.n_seq_max,
+                        1,
+                        hparams.n_swa,
+                        hparams.swa_type,
+                        nullptr,
+                        nullptr,
+                        nullptr,
+                        nullptr);
+            } break;
         case LLM_ARCH_DEEPSEEK32:
             {
                 res = new llama_kv_cache_dsa(
diff --git a/src/models/deepseek2ocr.cpp b/src/models/deepseek2ocr.cpp
index 65d31c31b93e..4b5f622f9afc 100644
--- a/src/models/deepseek2ocr.cpp
+++ b/src/models/deepseek2ocr.cpp
@@ -14,6 +14,12 @@ void llama_model_deepseek2ocr::load_arch_hparams(llama_model_loader & ml) {
         hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
     }
 
+    // Unlimited-OCR sets sliding_window -> R-SWA
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+    if (hparams.n_swa > 0) {
+        hparams.swa_type = LLAMA_SWA_TYPE_REFERENCE;
+    }
+
     switch (hparams.n_layer()) {
         case 12: type = LLM_TYPE_3B; break;
         default: type = LLM_TYPE_UNKNOWN;
diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py
index ec0b4523be9a..f9adf2c7bcf8 100644
--- a/tools/mtmd/tests/test-deepseek-ocr.py
+++ b/tools/mtmd/tests/test-deepseek-ocr.py
@@ -107,9 +107,11 @@ def chrf_min(self) -> float:
         model_key="unlimited", label="single-view scan",
         image="tools/mtmd/test-1.jpeg",
         ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
-        # HF reference: Unlimited-OCR scoring (gundam, bf16) on this image/ground-truth.
-        # Decoder runs full MHA, not R-SWA; the band absorbs that gap + bf16 variance.
-        hf_cer=0.1869, hf_chrf=75.23, cer_tol=0.06, chrf_tol=6.0,
+        # HF reference: demo/unlimited_ocr_scores.py (gundam, bf16) in the
+        # Unlimited-OCR-Universal fork, on this same image/ground-truth.
+        # llama.cpp runs R-SWA (n_swa>0 -> LLAMA_SWA_TYPE_REFERENCE) in the decoder,
+        # matching the regime the weights were trained under.
+        hf_cer=0.1869, hf_chrf=75.23, cer_tol=0.04, chrf_tol=4.0,
     ),
 ]
 

From cc6e8e0963238578a40262412f9ccdf57b2598d4 Mon Sep 17 00:00:00 2001
From: Saba Fallah <sabafallah@gmail.com>
Date: Wed, 24 Jun 2026 09:03:10 +0200
Subject: [PATCH 3/4] deepseek2-ocr: maintain R-SWA n_ref across kv-cache
 lifecycle ops

---
 src/llama-kv-cache.cpp | 64 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index a3130f251342..bdf7de83fcc5 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -479,6 +479,9 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll
     GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size());
     GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size());
 
+    // copy inherits the latched prefix
+    n_ref[seq_id_dst] = n_ref[seq_id_src];
+
     const auto s0 = seq_to_stream[seq_id_src];
     const auto s1 = seq_to_stream[seq_id_dst];
 
@@ -570,6 +573,13 @@ void llama_kv_cache::seq_keep(llama_seq_id seq_id) {
 
     GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
 
+    // other seqs are purged -> drop their latched prefix
+    for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+        if ((llama_seq_id) s != seq_id) {
+            n_ref[s] = -1;
+        }
+    }
+
     auto & cells = v_cells[seq_to_stream[seq_id]];
     auto & head  = v_heads[seq_to_stream[seq_id]];
 
@@ -620,6 +630,11 @@ void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, ll
         return;
     }
 
+    // the prefix boundary is an absolute pos -> shift it with its cells
+    if (n_ref[seq_id] >= 0 && n_ref[seq_id] >= p0 && n_ref[seq_id] < p1) {
+        n_ref[seq_id] += shift;
+    }
+
     for (uint32_t i = 0; i < cells.size(); ++i) {
         if (!cells.pos_in(i, p0, p1)) {
             continue;
@@ -667,6 +682,11 @@ void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, in
         return;
     }
 
+    // the prefix boundary is an absolute pos -> divide it with its cells
+    if (n_ref[seq_id] >= 0 && n_ref[seq_id] >= p0 && n_ref[seq_id] < p1) {
+        n_ref[seq_id] /= d;
+    }
+
     for (uint32_t i = 0; i < cells.size(); ++i) {
         if (!cells.pos_in(i, p0, p1)) {
             continue;
@@ -1993,6 +2013,20 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, lla
 
     io.write(&n_stream, sizeof(n_stream));
 
+    // persist n_ref; REFERENCE-guarded so other cache types' state format is unchanged.
+    // whole-cache case is count-prefixed to tolerate a different n_seq_max on restore.
+    if (swa_type == LLAMA_SWA_TYPE_REFERENCE) {
+        if (seq_id == -1) {
+            const uint32_t n_ref_count = n_seq_max;
+            io.write(&n_ref_count, sizeof(n_ref_count));
+            for (uint32_t i = 0; i < n_ref_count; ++i) {
+                io.write(&n_ref[i], sizeof(llama_pos));
+            }
+        } else {
+            io.write(&n_ref[seq_id], sizeof(llama_pos));
+        }
+    }
+
     for (uint32_t s = 0; s < n_stream; ++s) {
         cell_ranges_t cr { s, {} };
 
@@ -2069,6 +2103,25 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama
         throw std::runtime_error("n_stream mismatch");
     }
 
+    // read n_ref now but apply after the restore below; clear()/seq_rm() would reset it
+    std::array<llama_pos, LLAMA_MAX_SEQ> n_ref_restored;
+    n_ref_restored.fill(-1);
+    if (swa_type == LLAMA_SWA_TYPE_REFERENCE) {
+        if (seq_id == -1) {
+            uint32_t n_ref_count = 0;
+            io.read(&n_ref_count, sizeof(n_ref_count));
+            for (uint32_t i = 0; i < n_ref_count; ++i) {
+                llama_pos v;
+                io.read(&v, sizeof(v));
+                if (i < n_seq_max) {
+                    n_ref_restored[i] = v;
+                }
+            }
+        } else {
+            io.read(&n_ref_restored[seq_id], sizeof(llama_pos));
+        }
+    }
+
     for (uint32_t s = 0; s < n_stream; ++s) {
         uint32_t cell_count;
         io.read(&cell_count, sizeof(cell_count));
@@ -2094,6 +2147,17 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama
             throw std::runtime_error("failed to restore kv cache");
         }
     }
+
+    // cells restored -> reinstate n_ref
+    if (swa_type == LLAMA_SWA_TYPE_REFERENCE) {
+        if (seq_id == -1) {
+            for (uint32_t i = 0; i < n_seq_max; ++i) {
+                n_ref[i] = n_ref_restored[i];
+            }
+        } else {
+            n_ref[seq_id] = n_ref_restored[seq_id];
+        }
+    }
 }
 
 void llama_kv_cache::state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id) const {

From 649864fc6ba2fee8b7800b819808b94ef609dc9e Mon Sep 17 00:00:00 2001
From: Saba Fallah <sabafallah@gmail.com>
Date: Sat, 27 Jun 2026 13:20:04 +0200
Subject: [PATCH 4/4] deepseek2-ocr: default the V KV-cache to F32 to fix
 garbled dense tables

The DeepSeek-OCR / Unlimited-OCR decoder reads dense layout (e.g. tables)
by attending over the always-visible visual prefix. With the default F16
V-cache, those value vectors are truncated enough to garble the output:
table headers come out as "&quot;" / "&gt;" (reported on #24975), while the
official HF reference parses them correctly.

The HF reference accumulates attention in F32, so match it by promoting an
F16 V-cache type to F32 for LLM_ARCH_DEEPSEEK2OCR. Any other -ctv value
(e.g. q8_0) is still honored. Note that the check is on the resolved type,
so an explicit -ctv f16 cannot be told apart from the default and is
promoted too. This is the built-in equivalent of running with
--cache-type-v f32. The cuBLAS compute mode is not the cause: the headers
are emitted deep in autoregressive decode (a mat-vec path that bypasses
cuBLAS), so FORCE_CUBLAS_COMPUTE_32F has no effect; F16 V
storage/accumulation is what truncates.

Verified on the reported image (parses cleanly with no flags) and with
tools/mtmd/tests/test-deepseek-ocr.py: all cases pass and improve, no
regression (v1 0.2626, v2 0.6877, unlimited 0.1641 CER).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_015dykwunMpwXWxHPVhbjhiK
---
 src/llama-model.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 9cc5d1aa27e5..2e7b206dd298 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2027,11 +2027,20 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
             {
                 // R-SWA runs on one full cache - the REFERENCE mask keeps the
                 // prefix visible, so no eviction and no iswa.
+                //
+                // The V cache must be F32. This OCR decoder reads dense layout
+                // (e.g. tables) by attending over the always-visible visual
+                // prefix; an F16 V cache truncates those value vectors enough to
+                // garble the output (table headers come out as "&quot;"). The HF
+                // reference accumulates attention in F32, so match it here: an
+                // F16 type_v (the default, or an explicit -ctv f16) is promoted
+                // to F32; any other -ctv (e.g. q8_0) is honored. See PR #24975.
+                const ggml_type type_v = params.type_v == GGML_TYPE_F16 ? GGML_TYPE_F32 : params.type_v;
                 res = new llama_kv_cache(
                         *this,
                         hparams,
                         params.type_k,
-                        params.type_v,
+                        type_v,
                         !cparams.flash_attn,
                         cparams.offload_kqv,
                         cparams.kv_unified,