diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 220240ea952b..edbefcaecda3 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -194,6 +194,14 @@ llama_context::llama_context( cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL; } + // Non-causal models (e.g. embedding/classification encoders such as BERT) cannot generate text. + // A generation request (bounded n_outputs_max) would otherwise trip GGML_ASSERT in output_reserve(); + // fail early with a clear message instead of aborting. + if (!cparams.causal_attn && !llama_model_has_encoder(&model) && params.n_outputs_max != 0) { + throw std::runtime_error("this model is non-causal (e.g. an embedding/classification model) " + "and cannot be used for text generation; use the embedding API (e.g. llama-embedding) instead"); + } + cparams.flash_attn = params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED; cparams.auto_fa = params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 39b7eb218e69..1b3ca45305d2 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1166,6 +1166,11 @@ struct server_context_impl { return false; } + if (ctx_tgt == nullptr) { + SRV_ERR("failed to create context with model, '%s'\n", params_base.model.path.c_str()); + return false; + } + vocab = llama_model_get_vocab(model_tgt); n_ctx = llama_n_ctx(ctx_tgt);