From 43072d9c5e0a37fefbafdbd29fbe59f845b8987b Mon Sep 17 00:00:00 2001
From: jhsmith
Date: Thu, 18 Jun 2026 07:41:02 -0400
Subject: [PATCH] server : restore forwarding of base CLI args to router child instances

In router mode, parent llama-server CLI flags that aren't derivable from
the preset .ini (e.g. --parallel, --cache-type-k/v, --flash-attn, -ngl,
--threads, --numa) stopped reaching spawned child instances after #23976.
Children fell back to defaults -- most damagingly --parallel's new
default of -1=auto, which resolves to n_parallel=4 and multiplies the KV
cache, OOMing models whose ctx-size was tuned for a single slot.

The refactor dropped the final `final_presets[*].merge(base_preset)` pass
that gave the parent's CLI args highest precedence. Restore it.
base_preset and common_preset::merge() are unchanged, so this
re-establishes the pre-#23976 behavior.

Fixes #24762 (and the same-symptom #24735).
---
 tools/server/server-models.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
index ff9a0df12f4b..985bd220484f 100644
--- a/tools/server/server-models.cpp
+++ b/tools/server/server-models.cpp
@@ -351,6 +351,11 @@ void server_models::load_models() {
         source_map[name] = SERVER_MODEL_SOURCE_PRESET;
     }
 
+    // server base preset from CLI args takes highest precedence
+    for (auto & [name, preset] : final_presets) {
+        preset.merge(base_preset);
+    }
+
     auto get_source = [&](const std::string & name) {
         return source_map.count(name) ? source_map.at(name) : SERVER_MODEL_SOURCE_PRESET;
     };