From 381c3d299c7e89112a2436b7b6377f4c8be15cc9 Mon Sep 17 00:00:00 2001 From: Saba Fallah Date: Wed, 24 Jun 2026 07:55:22 +0200 Subject: [PATCH 1/4] mtmd: model: unlimited-ocr: converter + parity test --- conversion/__init__.py | 3 ++ conversion/deepseek.py | 12 +++++- tools/mtmd/tests/test-deepseek-ocr.py | 54 ++++++++++++++++++++++++--- 3 files changed, 61 insertions(+), 8 deletions(-) diff --git a/conversion/__init__.py b/conversion/__init__.py index 00192cf33a8c..2b6b7a4338fc 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -46,6 +46,7 @@ "DbrxForCausalLM": "dbrx", "DeciLMForCausalLM": "deci", "DeepseekForCausalLM": "deepseek", + "DeepseekOCRForCausalLM": "deepseek", "DeepseekV2ForCausalLM": "deepseek", "DeepseekV3ForCausalLM": "deepseek", "DeepseekV32ForCausalLM": "deepseek", @@ -231,6 +232,7 @@ "UMT5ForConditionalGeneration": "t5", "UMT5Model": "t5", "UltravoxModel": "ultravox", + "UnlimitedOCRForCausalLM": "deepseek", "VLlama3ForCausalLM": "llama", "VoxtralForConditionalGeneration": "llama", "WavTokenizerDec": "wavtokenizer", @@ -296,6 +298,7 @@ "StepVLForConditionalGeneration": "step3", "Step3p7ForConditionalGeneration": "step3", "UltravoxModel": "ultravox", + "UnlimitedOCRForCausalLM": "deepseek", "VoxtralForConditionalGeneration": "ultravox", "YoutuVLForConditionalGeneration": "youtuvl", } diff --git a/conversion/deepseek.py b/conversion/deepseek.py index 72520cc9f6a5..4c93fb66df64 100644 --- a/conversion/deepseek.py +++ b/conversion/deepseek.py @@ -14,7 +14,7 @@ from .qwen import QwenModel -@ModelBase.register("DeepseekOCRForCausalLM") +@ModelBase.register("DeepseekOCRForCausalLM", "UnlimitedOCRForCausalLM") class DeepseekOCRVisionModel(MmprojModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -205,6 +205,8 @@ def prepare_tensors(self): @ModelBase.register( "DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM", + "DeepseekOCRForCausalLM", + "UnlimitedOCRForCausalLM", "KimiVLForConditionalGeneration", 
"KimiK25ForConditionalGeneration", "YoutuForCausalLM", @@ -224,7 +226,7 @@ def __init__(self, *args, **kwargs): self.origin_hf_arch = hparams.get('architectures', [None])[0] # special handling for Deepseek OCR - if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"): + if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM", "UnlimitedOCRForCausalLM"): self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] self.gguf_writer.add_architecture() @@ -350,6 +352,12 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + # Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA) + if is_ocr: + sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window") + if sliding_window: + self.gguf_writer.add_sliding_window(sliding_window) + if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None: # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py index 5f5fef765a62..ec0b4523be9a 100644 --- a/tools/mtmd/tests/test-deepseek-ocr.py +++ b/tools/mtmd/tests/test-deepseek-ocr.py @@ -9,6 +9,7 @@ import argparse import logging +import re import subprocess import sys import unicodedata @@ -28,6 +29,12 @@ class ModelSpec: mmproj_arg: str model_default: str mmproj_default: str + prompt: str = "Free OCR. " + n_predict: int = 512 + n_ctx: int | None = None + # Unlimited-OCR's "document parsing" prompt emits <|det|> grounding markup that + # the HF reference strips in result.md; drop it before scoring to match. 
+ strip_grounding: bool = False @dataclass @@ -63,6 +70,20 @@ def chrf_min(self) -> float: model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf", mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf", ), + "unlimited": ModelSpec( + key="unlimited", label="Unlimited-OCR", + model_arg="--llama-model-unlimited", mmproj_arg="--mmproj-unlimited", + model_default="gguf_models/baidu/unlimited-ocr-bf16.gguf", + mmproj_default="gguf_models/baidu/mmproj-unlimited-ocr-bf16.gguf", + # "Free OCR." immediately emits EOS on this checkpoint; the HF reference + # (demo/unlimited_ocr_scores.py) uses "document parsing.", which grounds. + prompt="document parsing.", + # Grounding emits ~3x the tokens of plain OCR, so it needs a larger budget + # and context to reach the article body the ground truth covers. + n_predict=4096, + n_ctx=16384, + strip_grounding=True, + ), } CASES = [ @@ -82,9 +103,26 @@ def chrf_min(self) -> float: # is one pixel off and lands at ~0.69 instead. hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0, ), + TestCase( + model_key="unlimited", label="single-view scan", + image="tools/mtmd/test-1.jpeg", + ground_truth="tools/mtmd/tests/test-1-ground-truth.txt", + # HF reference: Unlimited-OCR scoring (gundam, bf16) on this image/ground-truth. + # Decoder runs full MHA, not R-SWA; the band absorbs that gap + bf16 variance. 
+ hf_cer=0.1869, hf_chrf=75.23, cer_tol=0.06, chrf_tol=6.0, + ), ] +GROUNDING_TAG_RE = re.compile(r"<\|(ref|det)\|>.*?<\|/\1\|>", re.DOTALL) + + +def strip_grounding(text: str) -> str: + """Drop <|ref|>..<|/ref|> / <|det|>..<|/det|> grounding markup, matching the + cleaned result.md the HF reference scores against.""" + return GROUNDING_TAG_RE.sub("", text) + + def arg_dest(flag: str) -> str: return flag.lstrip("-").replace("-", "_") @@ -129,19 +167,19 @@ def compute_chrf(expected: str, ocr_out: str) -> float: return CHRF().sentence_score(ocr_out, [expected]).score -def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str: +def run_mtmd_cli(spec: "ModelSpec", model_path, mmproj_path, image_path, bin_path) -> str: """Run mtmd-cli on the image and return its output.""" cmd = [ str(bin_path), "-m", str(model_path), "--mmproj", str(mmproj_path), "--image", str(image_path), - "-p", "Free OCR. ", + "-p", spec.prompt, "--chat-template", "deepseek-ocr", "--temp", "0", "--flash-attn", "off", # match the HF "eager" attention reference "--no-warmup", - "-n", "512", # cap loops on hard images (KV would otherwise fill) + "-n", str(spec.n_predict), # cap loops on hard images (KV would otherwise fill) # HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY. # Default DRY breakers include "\n", so they are cleared below. 
"--dry-multiplier", "0.8", @@ -150,6 +188,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str: "--dry-penalty-last-n", "-1", "--dry-sequence-breaker", "none", ] + if spec.n_ctx is not None: + cmd += ["-c", str(spec.n_ctx)] logger.debug(f" command: {' '.join(cmd)}") try: @@ -164,6 +204,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str: raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}") output = result.stdout.decode("utf-8", errors="replace").strip() + if spec.strip_grounding: + output = strip_grounding(output) if not output: raise RuntimeError("llama-mtmd-cli produced no output on stdout") logger.info(f" output: {len(output)} chars") @@ -193,7 +235,7 @@ def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool: logger.info("") logger.info("=" * 60) - logger.info("Free OCR evaluation:") + logger.info("OCR evaluation:") logger.info("=" * 60) logger.info(f" CER {cer:>7.4f} (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f} -> {verdict(cer_pass)})") logger.info(f" chrF (0-100) {chrf:>7.2f} (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f} -> {verdict(chrf_pass)})") @@ -269,9 +311,9 @@ def main() -> int: expected = read_expected_text(ground_truth) logger.info(f" Image: {case.image}") logger.info(f" Expected text: {len(expected)} chars") - logger.info(" Running llama.cpp 'Free OCR'") + logger.info(f" Running llama.cpp prompt {model_spec.prompt!r}") try: - ocr_out = run_mtmd_cli(model, mmproj, image, binary) + ocr_out = run_mtmd_cli(model_spec, model, mmproj, image, binary) except RuntimeError as e: logger.error(f" Error: {e}") results[title] = False From 823dfaac262e4663c59fcb044effbf50fccf6e72 Mon Sep 17 00:00:00 2001 From: Saba Fallah Date: Wed, 24 Jun 2026 08:23:33 +0200 Subject: [PATCH 2/4] deepseek2-ocr: R-SWA reference sliding window attention --- conversion/deepseek.py | 2 +- src/llama-graph.cpp | 5 +++- src/llama-hparams.h | 14 ++++++++++- src/llama-kv-cache.cpp | 35 
++++++++++++++++++++++++++- src/llama-kv-cache.h | 4 +++ src/llama-model.cpp | 22 +++++++++++++++++ src/models/deepseek2ocr.cpp | 6 +++++ tools/mtmd/tests/test-deepseek-ocr.py | 8 +++--- 8 files changed, 89 insertions(+), 7 deletions(-) diff --git a/conversion/deepseek.py b/conversion/deepseek.py index 4c93fb66df64..28d280ba7ddb 100644 --- a/conversion/deepseek.py +++ b/conversion/deepseek.py @@ -352,7 +352,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) - # Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA) + # Unlimited-OCR R-SWA sliding window; the deepseek2-ocr decoder reads it if is_ocr: sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window") if sliding_window: diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 68c9e606c3e3..2a8fce032242 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -395,6 +395,7 @@ static void print_mask(const T * data, int64_t n_tokens, int64_t n_kv, int64_t n case LLAMA_SWA_TYPE_STANDARD: swa_type_str = "LLAMA_SWA_TYPE_STANDARD"; break; case LLAMA_SWA_TYPE_CHUNKED: swa_type_str = "LLAMA_SWA_TYPE_CHUNKED"; break; case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break; + case LLAMA_SWA_TYPE_REFERENCE: swa_type_str = "LLAMA_SWA_TYPE_REFERENCE"; break; }; LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swa_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str); @@ -2285,7 +2286,9 @@ static std::unique_ptr build_attn_inp_kv_impl( auto inp = std::make_unique(hparams, cparams, mctx_cur); { - GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "Use llama_kv_cache_iswa for SWA"); + // REFERENCE masks within this single cache; other SWA types need iswa + GGML_ASSERT((hparams.swa_type == LLAMA_SWA_TYPE_NONE || + hparams.swa_type == LLAMA_SWA_TYPE_REFERENCE) && "Use llama_kv_cache_iswa for SWA"); inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch); inp->self_v_idxs = 
mctx_cur->build_input_v_idxs(ctx0, ubatch); diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 2eadeb214811..63474de780ec 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -21,6 +21,7 @@ enum llama_swa_type { LLAMA_SWA_TYPE_STANDARD = 1, LLAMA_SWA_TYPE_CHUNKED = 2, LLAMA_SWA_TYPE_SYMMETRIC = 3, + LLAMA_SWA_TYPE_REFERENCE = 4, // R-SWA: always-visible prefix + window over the rest }; // forward declaration; full definition in llama-graph.h @@ -357,7 +358,8 @@ struct llama_hparams { // note: inlined on purpose for performance reasons // TODO: think of a better place for this function // TODO: pack the SWA params in a struct? - static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1) { + // n_ref = R-SWA prefix length L_m (always-visible positions); < 0 = unlatched, full causal + static bool is_masked_swa(uint32_t n_swa, llama_swa_type swa_type, llama_pos p0, llama_pos p1, llama_pos n_ref = -1) { assert(p0 >= 0 && p1 >= 0); switch (swa_type) { @@ -388,6 +390,16 @@ struct llama_hparams { return true; } } break; + case LLAMA_SWA_TYPE_REFERENCE: + { + // visible iff in the prefix (p0 < n_ref) or within the window (p1 - p0 < n_swa) + const bool windowed = p1 - p0 >= (int32_t) n_swa; + const bool in_prefix = n_ref < 0 || p0 < n_ref; + + if (windowed && !in_prefix) { + return true; + } + } break; } return false; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 2802103bdd82..a3130f251342 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -100,6 +100,8 @@ llama_kv_cache::llama_kv_cache( v_cells_impl(other ? 
other->v_cells_impl : std::make_shared()), v_cells(*v_cells_impl) { + n_ref.fill(-1); + // shared cells view the source cache's K/V tensors, so the cell count // follows the source allocation: a fitted target can be smaller than the // draft default and oversized views would overflow the source tensors @@ -377,6 +379,8 @@ llama_kv_cache::llama_kv_cache( } void llama_kv_cache::clear(bool data) { + n_ref.fill(-1); + for (uint32_t s = 0; s < n_stream; ++s) { v_cells[s].reset(); v_heads[s] = 0; @@ -405,6 +409,15 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { p1 = std::numeric_limits::max(); } + // dropping from pos 0 invalidates the latched prefix + if (p0 == 0) { + if (seq_id >= 0) { + n_ref[seq_id] = -1; + } else { + n_ref.fill(-1); + } + } + if (seq_id >= 0) { auto & cells = v_cells[seq_to_stream[seq_id]]; auto & head = v_heads[seq_to_stream[seq_id]]; @@ -1109,6 +1122,22 @@ void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & return; } + // latch L_m at the prefill->decode boundary (first single-token append to a populated + // seq); until then the mask is full causal. assumes single-token decode (mtmd-cli/server). 
+ if (swa_type == LLAMA_SWA_TYPE_REFERENCE) { + uint32_t n_tok_seq[LLAMA_MAX_SEQ] = { 0 }; + for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { + n_tok_seq[ubatch.seq_id[i][0]]++; + } + for (uint32_t i = 0; i < ubatch.n_tokens; ++i) { + const llama_seq_id seq_id = ubatch.seq_id[i][0]; + if (n_ref[seq_id] < 0 && n_tok_seq[seq_id] == 1 && + v_cells[seq_to_stream[seq_id]].seq_pos_max(seq_id) >= 0) { + n_ref[seq_id] = ubatch.pos[i]; + } + } + } + // keep track of the max sequence position that we would overwrite with this ubatch // for non-SWA cache, this would be always empty llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ]; @@ -1519,6 +1548,9 @@ struct args_set_input_kq_mask { uint32_t n_swa; llama_swa_type swa_type; + // per-seq R-SWA prefix length L_m (-1 = unlatched), indexed by seq_id + const llama_pos * n_ref; + int64_t n_kv; int64_t n_stream; int64_t n_tps; @@ -1654,7 +1686,7 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, T * data // apply SWA if any if (swa) { - if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1)) { + if (llama_hparams::is_masked_swa(n_swa, swa_type, p0, p1, args.n_ref[seq_id])) { goto skip; } } @@ -1734,6 +1766,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u /*.seq_to_stream =*/ seq_to_stream, /*.n_swa =*/ n_swa, /*.swa_type =*/ swa_type, + /*.n_ref =*/ n_ref.data(), /*.n_kv =*/ n_kv, /*.n_stream =*/ n_stream, /*.n_tps =*/ n_tps, diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 3d68f98c1424..89edc0adb1a8 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -259,6 +259,10 @@ class llama_kv_cache : public llama_memory_i { // this is the SWA type of the cache - not to be confused with the model SWA type const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; + // R-SWA per-seq prefix length L_m (-1 = unlatched -> full causal mask); latched at the + // prefill->decode boundary in apply_ubatch, read by set_input_kq_mask. 
+ std::array n_ref; + // ggml contexts for the KV cache along with the allocated backend buffers: std::vector> ctxs_bufs; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d041a9ce3e27..9cc5d1aa27e5 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2023,6 +2023,28 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, { res = nullptr; } break; + case LLM_ARCH_DEEPSEEK2OCR: + { + // R-SWA runs on one full cache - the REFERENCE mask keeps the + // prefix visible, so no eviction and no iswa. + res = new llama_kv_cache( + *this, + hparams, + params.type_k, + params.type_v, + !cparams.flash_attn, + cparams.offload_kqv, + cparams.kv_unified, + cparams.n_ctx_seq, + cparams.n_seq_max, + 1, + hparams.n_swa, + hparams.swa_type, + nullptr, + nullptr, + nullptr, + nullptr); + } break; case LLM_ARCH_DEEPSEEK32: { res = new llama_kv_cache_dsa( diff --git a/src/models/deepseek2ocr.cpp b/src/models/deepseek2ocr.cpp index 65d31c31b93e..4b5f622f9afc 100644 --- a/src/models/deepseek2ocr.cpp +++ b/src/models/deepseek2ocr.cpp @@ -14,6 +14,12 @@ void llama_model_deepseek2ocr::load_arch_hparams(llama_model_loader & ml) { hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX; } + // Unlimited-OCR sets sliding_window -> R-SWA + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); + if (hparams.n_swa > 0) { + hparams.swa_type = LLAMA_SWA_TYPE_REFERENCE; + } + switch (hparams.n_layer()) { case 12: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py index ec0b4523be9a..f9adf2c7bcf8 100644 --- a/tools/mtmd/tests/test-deepseek-ocr.py +++ b/tools/mtmd/tests/test-deepseek-ocr.py @@ -107,9 +107,11 @@ def chrf_min(self) -> float: model_key="unlimited", label="single-view scan", image="tools/mtmd/test-1.jpeg", ground_truth="tools/mtmd/tests/test-1-ground-truth.txt", - # HF reference: Unlimited-OCR scoring (gundam, bf16) on 
this image/ground-truth. - # Decoder runs full MHA, not R-SWA; the band absorbs that gap + bf16 variance. - hf_cer=0.1869, hf_chrf=75.23, cer_tol=0.06, chrf_tol=6.0, + # HF reference: demo/unlimited_ocr_scores.py (gundam, bf16) in the + # Unlimited-OCR-Universal fork, on this same image/ground-truth. + # llama.cpp runs R-SWA (n_swa>0 -> LLAMA_SWA_TYPE_REFERENCE) in the decoder, + # matching the regime the weights were trained under. + hf_cer=0.1869, hf_chrf=75.23, cer_tol=0.04, chrf_tol=4.0, ), ] From cc6e8e0963238578a40262412f9ccdf57b2598d4 Mon Sep 17 00:00:00 2001 From: Saba Fallah Date: Wed, 24 Jun 2026 09:03:10 +0200 Subject: [PATCH 3/4] deepseek2-ocr: maintain R-SWA n_ref across kv-cache lifecycle ops --- src/llama-kv-cache.cpp | 64 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index a3130f251342..bdf7de83fcc5 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -479,6 +479,9 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size()); GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size()); + // copy inherits the latched prefix + n_ref[seq_id_dst] = n_ref[seq_id_src]; + const auto s0 = seq_to_stream[seq_id_src]; const auto s1 = seq_to_stream[seq_id_dst]; @@ -570,6 +573,13 @@ void llama_kv_cache::seq_keep(llama_seq_id seq_id) { GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); + // other seqs are purged -> drop their latched prefix + for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) { + if ((llama_seq_id) s != seq_id) { + n_ref[s] = -1; + } + } + auto & cells = v_cells[seq_to_stream[seq_id]]; auto & head = v_heads[seq_to_stream[seq_id]]; @@ -620,6 +630,11 @@ void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, ll return; } + // the prefix boundary is an absolute pos -> shift it with its cells + if 
(n_ref[seq_id] >= 0 && n_ref[seq_id] >= p0 && n_ref[seq_id] < p1) { + n_ref[seq_id] += shift; + } + for (uint32_t i = 0; i < cells.size(); ++i) { if (!cells.pos_in(i, p0, p1)) { continue; @@ -667,6 +682,11 @@ void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, in return; } + // the prefix boundary is an absolute pos -> divide it with its cells + if (n_ref[seq_id] >= 0 && n_ref[seq_id] >= p0 && n_ref[seq_id] < p1) { + n_ref[seq_id] /= d; + } + for (uint32_t i = 0; i < cells.size(); ++i) { if (!cells.pos_in(i, p0, p1)) { continue; @@ -1993,6 +2013,20 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, lla io.write(&n_stream, sizeof(n_stream)); + // persist n_ref; REFERENCE-guarded so other cache types' state format is unchanged. + // whole-cache case is count-prefixed to tolerate a different n_seq_max on restore. + if (swa_type == LLAMA_SWA_TYPE_REFERENCE) { + if (seq_id == -1) { + const uint32_t n_ref_count = n_seq_max; + io.write(&n_ref_count, sizeof(n_ref_count)); + for (uint32_t i = 0; i < n_ref_count; ++i) { + io.write(&n_ref[i], sizeof(llama_pos)); + } + } else { + io.write(&n_ref[seq_id], sizeof(llama_pos)); + } + } + for (uint32_t s = 0; s < n_stream; ++s) { cell_ranges_t cr { s, {} }; @@ -2069,6 +2103,25 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama throw std::runtime_error("n_stream mismatch"); } + // read n_ref now but apply after the restore below; clear()/seq_rm() would reset it + std::array n_ref_restored; + n_ref_restored.fill(-1); + if (swa_type == LLAMA_SWA_TYPE_REFERENCE) { + if (seq_id == -1) { + uint32_t n_ref_count = 0; + io.read(&n_ref_count, sizeof(n_ref_count)); + for (uint32_t i = 0; i < n_ref_count; ++i) { + llama_pos v; + io.read(&v, sizeof(v)); + if (i < n_seq_max) { + n_ref_restored[i] = v; + } + } + } else { + io.read(&n_ref_restored[seq_id], sizeof(llama_pos)); + } + } + for (uint32_t s = 0; s < n_stream; ++s) { uint32_t cell_count; 
io.read(&cell_count, sizeof(cell_count)); @@ -2094,6 +2147,17 @@ void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama throw std::runtime_error("failed to restore kv cache"); } } + + // cells restored -> reinstate n_ref + if (swa_type == LLAMA_SWA_TYPE_REFERENCE) { + if (seq_id == -1) { + for (uint32_t i = 0; i < n_seq_max; ++i) { + n_ref[i] = n_ref_restored[i]; + } + } else { + n_ref[seq_id] = n_ref_restored[seq_id]; + } + } } void llama_kv_cache::state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id) const { From 649864fc6ba2fee8b7800b819808b94ef609dc9e Mon Sep 17 00:00:00 2001 From: Saba Fallah Date: Sat, 27 Jun 2026 13:20:04 +0200 Subject: [PATCH 4/4] deepseek2-ocr: default the V KV-cache to F32 to fix garbled dense tables The DeepSeek-OCR / Unlimited-OCR decoder reads dense layout (e.g. tables) by attending over the always-visible visual prefix. With the default F16 V-cache, those value vectors are truncated enough to garble the output: table headers come out as """ / ">" (reported on #24975), while the official HF reference parses them correctly. The HF reference accumulates attention in F32, so match it by promoting the F16 V-cache default to F32 for LLM_ARCH_DEEPSEEK2OCR. An explicit lower-precision -ctv (e.g. q8_0) is still honored. This is the in-graph equivalent of running with --cache-type-v f32. It is not the cuBLAS compute mode: the headers are emitted deep in autoregressive decode (a mat-vec path that bypasses cuBLAS), so FORCE_CUBLAS_COMPUTE_32F has no effect; F16 V storage/accumulation is what truncates. Verified on the reported image (parses cleanly with no flags) and with tools/mtmd/tests/test-deepseek-ocr.py: all cases pass and improve, no regression (v1 0.2626, v2 0.6877, unlimited 0.1641 CER). 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_015dykwunMpwXWxHPVhbjhiK --- src/llama-model.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9cc5d1aa27e5..2e7b206dd298 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2027,11 +2027,20 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, { // R-SWA runs on one full cache - the REFERENCE mask keeps the // prefix visible, so no eviction and no iswa. + // + // The V cache defaults to F32. This OCR decoder reads dense layout + // (e.g. tables) by attending over the always-visible visual + // prefix; an F16 V cache truncates those value vectors enough to + // garble the output (table headers come out as """). The HF + // reference accumulates attention in F32, so match it here: an F16 + // type_v (the default, or an explicit -ctv f16) is promoted to F32; + // any other explicit -ctv (e.g. q8_0) is honored. See PR #24975. + const ggml_type type_v = params.type_v == GGML_TYPE_F16 ? GGML_TYPE_F32 : params.type_v; res = new llama_kv_cache( *this, hparams, params.type_k, - params.type_v, + type_v, !cparams.flash_attn, cparams.offload_kqv, cparams.kv_unified,