From 03e84331915efe7e9025db51678260d04dec8782 Mon Sep 17 00:00:00 2001 From: liminfei-amd <91481003+liminfei-amd@users.noreply.github.com> Date: Fri, 26 Jun 2026 10:01:03 +0800 Subject: [PATCH] llama : error clearly when a non-causal model is used for generation Loading an embedding/classification model (e.g. BERT/DistilRoBERTa) in a generation tool such as llama-cli currently aborts with GGML_ASSERT(n_outputs_max <= cparams.n_outputs_max) in output_reserve(), because non-causal models emit one output row per token while generation requests a bounded n_outputs_max. Fail early during llama_context construction with a clear message that points to the embedding API (e.g. llama-embedding), instead of asserting. Also add a missing null-context check in the shared server/cli load path so the tool exits cleanly rather than dereferencing a null context. Fixes #24967 Signed-off-by: liminfei-amd <91481003+liminfei-amd@users.noreply.github.com> --- src/llama-context.cpp | 8 ++++++++ tools/server/server-context.cpp | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 220240ea952b..edbefcaecda3 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -194,6 +194,14 @@ llama_context::llama_context( cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; } + // Non-causal models (e.g. embedding/classification encoders such as BERT) cannot generate text. + // A generation request (bounded n_outputs_max) would otherwise trip GGML_ASSERT in output_reserve(); + // fail early with a clear message instead of aborting. + if (!cparams.causal_attn && !llama_model_has_encoder(&model) && params.n_outputs_max != 0) { + throw std::runtime_error("this model is non-causal (e.g. an embedding/classification model) " + "and cannot be used for text generation; use the embedding API (e.g. llama-embedding) instead"); + } + cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED; cparams.auto_fa = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 39b7eb218e69..1b3ca45305d2 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1166,6 +1166,11 @@ struct server_context_impl { return false; } + if (ctx_tgt == nullptr) { + SRV_ERR("failed to create context with model, '%s'\n", params_base.model.path.c_str()); + return false; + } + vocab = llama_model_get_vocab(model_tgt); n_ctx = llama_n_ctx(ctx_tgt);