From 03e84331915efe7e9025db51678260d04dec8782 Mon Sep 17 00:00:00 2001
From: liminfei-amd <91481003+liminfei-amd@users.noreply.github.com>
Date: Fri, 26 Jun 2026 10:01:03 +0800
Subject: [PATCH] llama : error clearly when a non-causal model is used for
 generation

Loading an embedding/classification model (e.g. BERT/DistilRoBERTa) in a
generation tool such as llama-cli currently aborts with
GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max) in output_reserve(),
because non-causal models emit one output row per token, whereas
generation tools request a bounded n_outputs_max.

Fail early during llama_context construction with a clear message that points
to the embedding API (e.g. llama-embedding), instead of asserting. Also add a
missing null-context check in the shared server/cli load path so the tool
exits cleanly rather than dereferencing a null context.

Fixes #24967

Signed-off-by: liminfei-amd <91481003+liminfei-amd@users.noreply.github.com>
---
 src/llama-context.cpp           | 8 ++++++++
 tools/server/server-context.cpp | 5 +++++
 2 files changed, 13 insertions(+)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 220240ea952b..edbefcaecda3 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -194,6 +194,14 @@ llama_context::llama_context(
         cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
     }
 
+    // Non-causal models (e.g. embedding/classification encoders such as BERT) cannot generate text.
+    // A generation request (bounded n_outputs_max) would otherwise trip GGML_ASSERT in output_reserve();
+    // fail early with a clear message instead of aborting.
+    if (!cparams.causal_attn && !llama_model_has_encoder(&model) && params.n_outputs_max != 0) {
+        throw std::runtime_error("this model is non-causal (e.g. an embedding/classification model) "
+            "and cannot be used for text generation; use the embedding API (e.g. llama-embedding) instead");
+    }
+
     cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED;
     cparams.auto_fa    = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO;
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 39b7eb218e69..1b3ca45305d2 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1166,6 +1166,11 @@ struct server_context_impl {
             return false;
         }
 
+        if (ctx_tgt == nullptr) {
+            SRV_ERR("failed to create context with model, '%s'\n", params_base.model.path.c_str());
+            return false;
+        }
+
         vocab = llama_model_get_vocab(model_tgt);
 
         n_ctx = llama_n_ctx(ctx_tgt);