From 28536fa017dc6249e98aa1533efe076e2eea0ee3 Mon Sep 17 00:00:00 2001
From: Piotr Wilkin <piotr.wilkin@syndatis.com>
Date: Fri, 12 Jun 2026 20:05:57 +0200
Subject: [PATCH 1/2] cli : rewrite as HTTP client with MVC architecture

The CLI no longer links server-context; it always talks to llama-server
over HTTP. Following the MVC direction proposed in the PR review:

- cli_client (model): thin HTTP/SSE client for the llama-server API,
  data only, real-time responses delivered via callbacks
- cli_context (controller): owns the chat state and the interactive
  loop, renders through the view
- cli_view (view): user-facing input/output interface, implemented on
  top of common/console
- cli_server: optional local llama-server child process, managed with
  the same stdin/stdout protocol the server router mode uses for model
  instances (ready signal on stdout, exit on stdin EOF, kill on
  timeout), so no orphan process is left behind in either direction

By default llama-cli spawns a local llama-server on a free port and
forwards all server-relevant args to it; with --server-base URL it
connects to an external instance instead and model args are not
required. Media files are sent as base64 content parts, and chat
templating, reasoning parsing and sampling are now handled server-side.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 common/arg.cpp              |  18 +-
 common/common.h             |   2 +
 tools/cli/CMakeLists.txt    |  16 +-
 tools/cli/cli-client.cpp    | 141 +++++++++
 tools/cli/cli-client.h      |  57 ++++
 tools/cli/cli-context.cpp   | 586 ++++++++++++++++++++++++++++++++++++
 tools/cli/cli-context.h     |  65 ++++
 tools/cli/cli-server.cpp    | 336 +++++++++++++++++++++
 tools/cli/cli-server.h      |  66 ++++
 tools/cli/cli-view.cpp      |  63 ++++
 tools/cli/cli-view.h        |  61 ++++
 tools/cli/cli.cpp           | 507 ++-----------------------------
 tools/server/CMakeLists.txt |   5 +
 13 files changed, 1431 insertions(+), 492 deletions(-)
 create mode 100644 tools/cli/cli-client.cpp
 create mode 100644 tools/cli/cli-client.h
 create mode 100644 tools/cli/cli-context.cpp
 create mode 100644 tools/cli/cli-context.h
 create mode 100644 tools/cli/cli-server.cpp
 create mode 100644 tools/cli/cli-server.h
 create mode 100644 tools/cli/cli-view.cpp
 create mode 100644 tools/cli/cli-view.h

diff --git a/common/arg.cpp b/common/arg.cpp
index 55795d357d90..f58f0a733b58 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -635,14 +635,17 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
+    // with --server-base, the model is provided by the external server
+    const bool has_remote_server = !params.server_base.empty();
+
     // handle model and download
-    if (!skip_model_download) {
+    if (!skip_model_download && !has_remote_server) {
         common_params_handle_models(params, ctx_arg.ex);
     }
 
-    // model is required (except for server)
+    // model is required (except for server, or when an external server provides it)
     // TODO @ngxson : maybe show a list of available models in CLI in this case
-    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) {
+    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion && !has_remote_server) {
         throw std::invalid_argument("error: --model is required\n");
     }
 
@@ -1547,6 +1550,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.single_turn = true;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
+    add_opt(common_arg(
+        {"--server-base"}, "URL",
+        "base url of an external llama-server instance to connect to, e.g. http://localhost:8080\n"
+        "when set, llama-cli does not spawn a local server and model args are ignored\n"
+        "(default: unset, spawn a local llama-server)",
+        [](common_params & params, const std::string & value) {
+            params.server_base = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SERVER_BASE"));
     add_opt(common_arg(
         {"-i", "--interactive"},
         string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
diff --git a/common/common.h b/common/common.h
index 4864186f6287..58cf2f57943e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -563,6 +563,8 @@ struct common_params {
 
     bool single_turn       = false; // single turn chat conversation
 
+    std::string server_base = ""; // base url of an external llama-server to connect to (CLI) // NOLINT
+
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 
diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt
index a3e635719b67..fd256fa6f9e5 100644
--- a/tools/cli/CMakeLists.txt
+++ b/tools/cli/CMakeLists.txt
@@ -2,11 +2,21 @@
 
 set(TARGET llama-cli-impl)
 
-add_library(${TARGET} cli.cpp)
+add_library(${TARGET}
+    cli.cpp
+    cli-client.cpp
+    cli-client.h
+    cli-context.cpp
+    cli-context.h
+    cli-server.cpp
+    cli-server.h
+    cli-view.cpp
+    cli-view.h
+)
 set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
-target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ../server)
-target_link_libraries(${TARGET} PUBLIC server-context llama-common ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(${TARGET} PUBLIC llama-common cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
 
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} LIBRARY)
diff --git a/tools/cli/cli-client.cpp b/tools/cli/cli-client.cpp
new file mode 100644
index 000000000000..d45affba931c
--- /dev/null
+++ b/tools/cli/cli-client.cpp
@@ -0,0 +1,141 @@
+#include "cli-client.h"
+
+#include "http.h"
+
+#include <algorithm>
+#include <chrono>
+#include <thread>
+
+// read timeout (in seconds) applied by get/post/post_sse; generation can stall
+// for a long time during prompt processing, so it must be generous
+static constexpr time_t CLI_HTTP_READ_TIMEOUT_SEC = 3600;
+
+// upper bound (in bytes) for the response body that post_sse accumulates for error reporting
+static constexpr size_t CLI_HTTP_MAX_ERROR_BODY = 1024 * 1024;
+
+// prepend the path component of the base url (if it has one) to an API path;
+// a single trailing '/' of the base path is dropped: "/api/" + "/health" gives "/api/health"
+static std::string join_path(const common_http_url & parts, const std::string & path) {
+    const std::string & base_path = parts.path;
+    const bool has_prefix = !base_path.empty() && base_path != "/";
+    if (!has_prefix) {
+        return path;
+    }
+    const size_t keep = base_path.back() == '/' ? base_path.size() - 1 : base_path.size();
+    return base_path.substr(0, keep) + path;
+}
+
+json cli_client::get(const std::string & path) {
+    auto [conn, base_url] = common_http_client(server_base);
+    conn.set_read_timeout(CLI_HTTP_READ_TIMEOUT_SEC, 0);
+    auto response = conn.Get(join_path(base_url, path));
+    if (!response) { // transport failure: there is no HTTP response to inspect
+        throw std::runtime_error("failed to connect to " + server_base + ": " + httplib::to_string(response.error()));
+    }
+    if (!(response->status >= 200 && response->status < 300)) {
+        throw std::runtime_error("GET " + path + " failed with status " + std::to_string(response->status) + ": " + response->body);
+    }
+    json parsed = json::parse(response->body, nullptr, false); // non-throwing parse, failure yields a discarded value
+    if (parsed.is_discarded()) {
+        throw std::runtime_error("GET " + path + " returned invalid JSON");
+    }
+    return parsed;
+}
+
+json cli_client::post(const std::string & path, const json & body) {
+    auto [conn, base_url] = common_http_client(server_base);
+    conn.set_read_timeout(CLI_HTTP_READ_TIMEOUT_SEC, 0);
+    auto response = conn.Post(join_path(base_url, path), body.dump(), "application/json");
+    if (!response) { // transport failure: there is no HTTP response to inspect
+        throw std::runtime_error("failed to connect to " + server_base + ": " + httplib::to_string(response.error()));
+    }
+    if (!(response->status >= 200 && response->status < 300)) {
+        throw std::runtime_error("POST " + path + " failed with status " + std::to_string(response->status) + ": " + response->body);
+    }
+    json parsed = json::parse(response->body, nullptr, false); // non-throwing parse, failure yields a discarded value
+    if (parsed.is_discarded()) {
+        throw std::runtime_error("POST " + path + " returned invalid JSON");
+    }
+    return parsed;
+}
+
+json cli_client::post_sse(const std::string & path,
+                          const json & body,
+                          const std::function<bool()> & should_stop,
+                          const std::function<void(const json &)> & on_data) {
+    auto [cli, parts] = common_http_client(server_base);
+    cli.set_read_timeout(CLI_HTTP_READ_TIMEOUT_SEC, 0);
+
+    std::string pending;  // bytes received so far that do not yet form a complete line
+    std::string raw_body; // accumulated body (capped at CLI_HTTP_MAX_ERROR_BODY), used only for error reporting
+
+    auto receiver = [&](const char * data, size_t len) -> bool { // passed to cli.Post below to consume the response body
+        if (should_stop()) {
+            return false; // aborts the request
+        }
+        if (raw_body.size() < CLI_HTTP_MAX_ERROR_BODY) {
+            raw_body.append(data, std::min(len, CLI_HTTP_MAX_ERROR_BODY - raw_body.size()));
+        }
+        pending.append(data, len);
+        size_t pos;
+        while ((pos = pending.find('\n')) != std::string::npos) { // consume every complete line in the buffer
+            std::string line = pending.substr(0, pos);
+            pending.erase(0, pos + 1);
+            if (!line.empty() && line.back() == '\r') {
+                line.pop_back(); // tolerate CRLF line endings
+            }
+            if (line.rfind("data: ", 0) != 0) {
+                continue; // only lines starting with "data: " are handled, everything else is skipped
+            }
+            std::string payload = line.substr(6); // strip the "data: " prefix
+            if (payload == "[DONE]") {
+                continue; // "[DONE]" payloads are not json and are not forwarded
+            }
+            json event = json::parse(payload, nullptr, false);
+            if (!event.is_discarded()) { // payloads that are not valid json are silently dropped
+                on_data(event);
+            }
+        }
+        return true;
+    };
+
+    httplib::Headers headers = {{"Accept", "text/event-stream"}};
+    auto res = cli.Post(join_path(parts, path), headers, body.dump(), "application/json", receiver);
+
+    if (!res) { // no usable response: cancelled or transport error
+        if (res.error() == httplib::Error::Canceled && should_stop()) {
+            return json(); // cancelled by the user
+        }
+        return json {{"error", {{"message", "failed to connect to " + server_base + ": " + httplib::to_string(res.error())}}}};
+    }
+    if (res->status < 200 || res->status >= 300) {
+        json error_body = json::parse(raw_body, nullptr, false); // prefer the error json sent by the server, if any
+        if (!error_body.is_discarded() && error_body.contains("error")) {
+            return error_body;
+        }
+        return json {{"error", {{"message", "request failed with status " + std::to_string(res->status)}}}};
+    }
+    return json(); // null json: the stream finished without a request-level error
+}
+
+bool cli_client::wait_health(const std::function<bool()> & is_aborted) {
+    int connect_attempts = 0; // number of failed requests so far (cumulative, never reset)
+    while (!is_aborted()) {
+        auto [cli, parts] = common_http_client(server_base);
+        cli.set_connection_timeout(1, 0);
+        auto res = cli.Get(join_path(parts, "/health"));
+        if (res) {
+            if (res->status == 200) {
+                return true;
+            }
+            // any other status means the server is up but not ready yet
+            // (e.g. 503 while the model is still loading)
+        } else if (++connect_attempts >= 10) { // give up after the 10th failed request
+            last_error = "failed to connect to " + server_base + ": " + httplib::to_string(res.error());
+            return false;
+        }
+        std::this_thread::sleep_for(std::chrono::milliseconds(300)); // pause between polls
+    }
+    last_error = "aborted while waiting for the server to become ready";
+    return false;
+}
diff --git a/tools/cli/cli-client.h b/tools/cli/cli-client.h
new file mode 100644
index 000000000000..7d22cac2493e
--- /dev/null
+++ b/tools/cli/cli-client.h
@@ -0,0 +1,57 @@
+// HTTP API client for llama-cli (the "model" in MVC)
+//
+// a thin client for the llama-server HTTP API, roughly equivalent to the
+// openai python package: it only provides data and never touches the view;
+// real-time responses are delivered through callbacks
+
+#pragma once
+
+#include "ggml.h" // for GGML_ASSERT
+
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
+
+#include <functional>
+#include <string>
+
+using json = nlohmann::ordered_json;
+
+struct cli_client {
+    std::string server_base; // base url, for example "http://127.0.0.1:8080"
+    std::string last_error;  // set when wait_health() fails
+
+    // simple GET request, returns the response json
+    // throws std::runtime_error on transport error, non-2xx status or a body that is not valid JSON
+    json get(const std::string & path);
+
+    // simple POST request, returns the response json
+    // throws std::runtime_error on transport error, non-2xx status or a body that is not valid JSON
+    json post(const std::string & path, const json & body);
+
+    // POST request with an SSE streaming response; on_data is invoked once per
+    // "data: " line carrying valid json ("[DONE]" is skipped); returns after the
+    // stream is finished: a null json on graceful exit (incl. cancellation via
+    // should_stop), otherwise a json with an "error" member describing the failure
+    json post_sse(const std::string & path,
+                  const json & body,
+                  const std::function<bool()> & should_stop,
+                  const std::function<void(const json &)> & on_data);
+
+    // poll /health until it answers with status 200; returns false (and sets
+    // last_error) if is_aborted returned true or the server is unreachable
+    bool wait_health(const std::function<bool()> & is_aborted);
+
+    //
+    // higher-level wrappers
+    //
+
+    json create_chat_completion(const json & request, // post_sse on "/v1/chat/completions"
+                                const std::function<bool()> & should_stop,
+                                const std::function<void(const json &)> & on_data) {
+        return post_sse("/v1/chat/completions", request, should_stop, on_data);
+    }
+
+    json get_props() { // get on "/props", throws like get()
+        return get("/props");
+    }
+};
diff --git a/tools/cli/cli-context.cpp b/tools/cli/cli-context.cpp
new file mode 100644
index 000000000000..fdb7c0c38f4e
--- /dev/null
+++ b/tools/cli/cli-context.cpp
@@ -0,0 +1,586 @@
+#include "cli-context.h"
+
+#include "arg.h"
+#include "base64.hpp"
+#include "log.h"
+
+#include <algorithm>
+#include <filesystem>
+#include <fstream>
+#include <map>
+#include <set>
+
+static const char * LLAMA_ASCII_LOGO = R"(
+▄▄ ▄▄
+██ ██
+██ ██  ▀▀█▄ ███▄███▄  ▀▀█▄    ▄████ ████▄ ████▄
+██ ██ ▄█▀██ ██ ██ ██ ▄█▀██    ██    ██ ██ ██ ██
+██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀
+                                    ██    ██
+                                    ▀▀    ▀▀
+)";
+
+std::atomic<bool> g_cli_interrupted = false; // definition of the flag declared extern in cli-context.h
+
+static bool should_stop() { // reports whether the interrupt flag is currently set
+    return g_cli_interrupted.load();
+}
+
+static constexpr size_t FILE_GLOB_MAX_RESULTS = 100; // max number of files loaded by a single /glob command
+
+// how many command-line values follow this arg: 2 when a second value hint
+// is present, otherwise 1 when a first value hint is present, otherwise 0
+static int arg_num_values(const common_arg & opt) {
+    const bool takes_second = opt.value_hint_2 != nullptr;
+    const bool takes_first  = opt.value_hint   != nullptr;
+    if (takes_second) {
+        return 2;
+    }
+    return takes_first ? 1 : 0;
+}
+
+// keep only the args that llama-server understands, so that the remainder
+// of the command line can be forwarded to the spawned server child
+static std::vector<std::string> filter_server_args(int argc, char ** argv) {
+    std::map<std::string, int> cli_n_values; // arg -> number of values
+    std::set<std::string>      server_args;  // every arg spelling registered for LLAMA_EXAMPLE_SERVER
+
+    common_params dummy_cli; // throwaway params, only needed to build the option table
+    auto ctx_cli = common_params_parser_init(dummy_cli, LLAMA_EXAMPLE_CLI);
+    for (const auto & opt : ctx_cli.options) {
+        for (const char * a : opt.args) {
+            cli_n_values[a] = arg_num_values(opt);
+        }
+        for (const char * a : opt.args_neg) {
+            cli_n_values[a] = 0; // negated spellings are recorded as taking no value
+        }
+    }
+
+    common_params dummy_server; // throwaway params, only needed to build the option table
+    auto ctx_server = common_params_parser_init(dummy_server, LLAMA_EXAMPLE_SERVER);
+    for (const auto & opt : ctx_server.options) {
+        for (const char * a : opt.args) {
+            server_args.insert(a);
+        }
+        for (const char * a : opt.args_neg) {
+            server_args.insert(a);
+        }
+    }
+
+    std::vector<std::string> result;
+    for (int i = 1; i < argc; i++) { // argv[0] is skipped
+        const std::string arg = argv[i];
+        auto it = cli_n_values.find(arg);
+        if (it == cli_n_values.end()) {
+            // not a known arg (should not happen when parsing succeeded)
+            continue;
+        }
+        const bool forward = server_args.count(arg) > 0; // forwarded only when the server registers the same spelling
+        if (forward) {
+            result.push_back(arg);
+        }
+        for (int j = 0; j < it->second && i + 1 < argc; j++) { // consume the arg's values; they follow the arg's fate
+            i++;
+            if (forward) {
+                result.push_back(argv[i]);
+            }
+        }
+    }
+    return result;
+}
+
+static std::string format_error_message(const json & err) {
+    if (err.contains("error") && err.at("error").is_object()) {
+        const auto & e = err.at("error");
+        if (e.contains("message") && e.at("message").is_string()) {
+            return e.at("message").get<std::string>();
+        }
+    }
+    return err.dump();
+}
+
+// guess the media type ("audio", "video" or "image") from the file extension, case-insensitively
+static std::string media_type_from_ext(const std::string & fname) {
+    std::string ext = std::filesystem::path(fname).extension().string();
+    // go through unsigned char: calling tolower() with a negative char value is undefined behavior
+    std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+    if (ext == ".wav" || ext == ".mp3") {
+        return "audio";
+    }
+    const bool is_video = ext == ".mp4" || ext == ".avi" || ext == ".mkv" || ext == ".mov" || ext == ".webm";
+    return is_video ? "video" : "image";
+}
+
+bool cli_context::init(int argc, char ** argv) {
+    if (!params.server_base.empty()) { // an external server was given: connect to it
+        std::string base = params.server_base;
+        while (!base.empty() && base.back() == '/') {
+            base.pop_back(); // drop all trailing slashes
+        }
+        client.server_base = base;
+
+        view.print("Connecting to " + client.server_base + " ... ");
+        view.spinner_start();
+    } else { // no external server: spawn a local llama-server child
+        if (params.model.path.empty() && params.model.url.empty() &&
+                params.model.hf_repo.empty() && params.model.docker_repo.empty()) {
+            view.print_error("no model specified\n");
+            view.print("use -m <file.gguf> or -hf <user/repo> to run a local model,\n"
+                       "or --server-base <url> to connect to a running llama-server\n");
+            return false;
+        }
+
+        const bool pass_output = params.verbosity >= LOG_LEVEL_INFO; // passed to server->start(); when false, recent_output() is printed on failure instead
+
+        view.print("Loading model... ");
+        view.spinner_start();
+
+        server.emplace();
+        if (!server->start(filter_server_args(argc, argv), pass_output)) {
+            view.spinner_stop();
+            view.print_error("\n" + server->last_error + "\n");
+            return false;
+        }
+        if (!server->wait_ready(should_stop)) {
+            view.spinner_stop();
+            if (!should_stop()) { // stay quiet when the failure was caused by an interrupt
+                view.print_error("\nthe server exited before becoming ready\n");
+                if (!pass_output) {
+                    view.print(server->recent_output());
+                }
+            }
+            return false;
+        }
+        client.server_base = server->address();
+    }
+
+    // for --server-base this is the main availability check; for a spawned
+    // server it is a cheap sanity check on top of the ready signal
+    auto is_aborted = [this]() { // stop polling on interrupt, or when the managed server is no longer alive
+        return should_stop() || (server && !server->alive());
+    };
+    bool healthy = false;
+    try {
+        healthy = client.wait_health(is_aborted);
+    } catch (const std::exception & e) {
+        client.last_error = e.what(); // reported below through the same path as a regular failure
+    }
+    if (!healthy) {
+        view.spinner_stop();
+        if (!should_stop()) {
+            view.print_error("\n" + client.last_error + "\n");
+        }
+        return false;
+    }
+
+    fetch_server_props(); // catches its own exceptions, so a failing /props does not abort init
+
+    view.spinner_stop();
+    view.print("\n");
+
+    return true;
+}
+
+void cli_context::fetch_server_props() { // fills model_name, build_info and the has_* flags from /props
+    try {
+        json props = client.get_props();
+        model_name = props.value("model_alias", "");
+        if (model_name.empty()) { // no alias: fall back to the file name of "model_path"
+            const std::string path = props.value("model_path", "");
+            if (!path.empty()) {
+                model_name = std::filesystem::path(path).filename().string();
+            }
+        }
+        build_info = props.value("build_info", "");
+        if (props.contains("modalities") && props.at("modalities").is_object()) {
+            const auto & modalities = props.at("modalities");
+            has_vision = modalities.value("vision", false);
+            has_audio  = modalities.value("audio", false);
+            has_video  = modalities.value("video", false);
+        }
+    } catch (const std::exception & e) {
+        // /props can be disabled on remote servers; not fatal
+        LOG_DBG("failed to fetch /props: %s\n", e.what()); // members not assigned before the exception keep their previous values
+    }
+}
+
+// appends the configured system prompt (if any) to the chat history
+void cli_context::add_system_prompt() {
+    if (params.system_prompt.empty()) {
+        return;
+    }
+    json system_msg = {{"role", "system"}, {"content", params.system_prompt}};
+    messages.push_back(system_msg);
+}
+
+// appends a user turn to the chat history
+//
+// without staged media the content is the plain text; otherwise it is an
+// array of content parts: the staged media first, then the text, and the
+// staging area is emptied afterwards
+void cli_context::push_user_message(const std::string & text) {
+    json user_msg = {{"role", "user"}, {"content", text}};
+
+    const bool is_multimodal = !pending_media.empty();
+    if (is_multimodal) {
+        json parts     = pending_media;
+        json text_part = {{"type", "text"}, {"text", text}};
+        parts.push_back(text_part);
+        user_msg["content"] = parts;
+        pending_media       = json::array();
+    }
+    messages.push_back(user_msg);
+}
+
+// reads the file and stages it in pending_media as a base64 content part;
+// the layout of the part depends on type: "audio" and "video" get their own
+// part kinds, any other value is staged as an image
+// returns false (staging nothing) when the file cannot be opened
+bool cli_context::stage_media_file(const std::string & fname, const std::string & type) {
+    std::ifstream file(fname, std::ios::binary);
+    if (!file) {
+        return false;
+    }
+    std::string data((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+    std::string encoded = base64::encode(data);
+
+    if (type == "audio") {
+        std::string ext = std::filesystem::path(fname).extension().string();
+        // go through unsigned char: calling tolower() with a negative char value is undefined behavior
+        std::transform(ext.begin(), ext.end(), ext.begin(), [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+        pending_media.push_back({
+            {"type", "input_audio"},
+            {"input_audio", {
+                {"data",   encoded},
+                {"format", ext == ".mp3" ? "mp3" : "wav"}
+            }}
+        });
+    } else if (type == "video") {
+        pending_media.push_back({
+            {"type", "input_video"},
+            {"input_video", {{"data", encoded}}}
+        });
+    } else {
+        // the server detects the actual image type from the data
+        pending_media.push_back({
+            {"type", "image_url"},
+            {"image_url", {{"url", "data:image/unknown;base64," + encoded}}}
+        });
+    }
+    return true;
+}
+
+bool cli_context::generate_completion(std::string & assistant_content, cli_timings & timings) {
+    json body = {
+        {"messages",          messages},
+        {"stream",            true},
+        // in order to get timings even when we cancel mid-way
+        {"timings_per_token", true},
+    };
+
+    bool is_thinking   = false; // true while reasoning text is being printed
+    bool spinner_alive = true;  // the spinner is stopped at most once, on the first visible output
+    bool stream_error  = false; // set when a streamed chunk carries an "error" member
+
+    auto stop_spinner = [&]() {
+        if (spinner_alive) {
+            spinner_alive = false;
+            view.spinner_stop();
+        }
+    };
+
+    view.spinner_start();
+
+    json err = client.create_chat_completion(body, should_stop, [&](const json & chunk) { // invoked for each streamed json event
+        if (chunk.contains("error")) {
+            stop_spinner();
+            stream_error = true;
+            view.print_error("Error: " + format_error_message(chunk) + "\n");
+            return;
+        }
+        if (chunk.contains("timings")) { // keep the latest values seen; later chunks overwrite earlier ones
+            const auto & t = chunk.at("timings");
+            timings.prompt_per_second    = t.value("prompt_per_second",    0.0);
+            timings.predicted_per_second = t.value("predicted_per_second", 0.0);
+        }
+        if (!chunk.contains("choices") || !chunk.at("choices").is_array() || chunk.at("choices").empty()) {
+            return;
+        }
+        const auto & choice = chunk.at("choices").at(0); // only the first choice is rendered
+        if (!choice.contains("delta")) {
+            return;
+        }
+        const auto & delta = choice.at("delta");
+        if (delta.contains("reasoning_content") && delta.at("reasoning_content").is_string()) {
+            const std::string text = delta.at("reasoning_content").get<std::string>();
+            if (!text.empty()) {
+                stop_spinner();
+                if (!is_thinking) {
+                    view.print_reasoning("[Start thinking]\n");
+                    is_thinking = true;
+                }
+                view.print_reasoning(text); // reasoning is displayed but not added to assistant_content
+                view.flush();
+            }
+        }
+        if (delta.contains("content") && delta.at("content").is_string()) {
+            const std::string text = delta.at("content").get<std::string>();
+            if (!text.empty()) {
+                stop_spinner();
+                if (is_thinking) { // the first regular content closes the reasoning section
+                    view.print_reasoning("\n[End thinking]\n\n");
+                    is_thinking = false;
+                }
+                assistant_content += text;
+                view.print(text);
+                view.flush();
+            }
+        }
+    });
+
+    stop_spinner();
+    g_cli_interrupted.store(false); // an interrupt during generation only cancels this request
+
+    if (!err.is_null()) { // request-level failure reported by the client
+        view.print_error("Error: " + format_error_message(err) + "\n");
+        return false;
+    }
+    return !stream_error;
+}
+
+int cli_context::run() {
+    std::string modalities = "text"; // human-readable list shown in the banner
+    if (has_vision) {
+        modalities += ", vision";
+    }
+    if (has_audio) {
+        modalities += ", audio";
+    }
+    if (has_video) {
+        modalities += ", video";
+    }
+
+    add_system_prompt();
+
+    view.print("\n");
+    view.print(LLAMA_ASCII_LOGO);
+    view.print("\n");
+    if (!build_info.empty()) {
+        view.print(string_format("build      : %s\n", build_info.c_str()));
+    }
+    view.print(string_format("model      : %s\n", model_name.empty() ? "(unknown)" : model_name.c_str()));
+    view.print(string_format("server     : %s%s\n", client.server_base.c_str(), server ? " (managed by llama-cli)" : ""));
+    view.print(string_format("modalities : %s\n", modalities.c_str()));
+    if (!params.system_prompt.empty()) {
+        view.print("using custom system prompt\n");
+    }
+    view.print("\n");
+    view.print("available commands:\n");
+    view.print("  /exit or Ctrl+C     stop or exit\n");
+    view.print("  /regen              regenerate the last response\n");
+    view.print("  /clear              clear the chat history\n");
+    view.print("  /read <file>        add a text file\n");
+    view.print("  /glob <pattern>     add text files using globbing pattern\n");
+    if (has_vision) {
+        view.print("  /image <file>       add an image file\n");
+    }
+    if (has_audio) {
+        view.print("  /audio <file>       add an audio file\n");
+    }
+    if (has_video) {
+        view.print("  /video <file>       add a video file\n");
+    }
+    view.print("\n");
+
+    // interactive loop
+    std::string cur_msg; // user message under construction: file contents from /read and /glob, then the typed text
+
+    auto add_text_file = [&](const std::string & fname) -> bool { // appends a "--- File: <name> ---" header plus the file content to cur_msg
+        std::ifstream file(fname, std::ios::binary);
+        if (!file) {
+            view.print_error(string_format("file does not exist or cannot be opened: '%s'\n", fname.c_str()));
+            return false;
+        }
+        std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
+        cur_msg += "--- File: ";
+        cur_msg += fname;
+        cur_msg += " ---\n";
+        cur_msg += content;
+        view.print(string_format("Loaded text from '%s'\n", fname.c_str()));
+        return true;
+    };
+
+    while (true) {
+        std::string buffer;
+        if (params.prompt.empty()) { // no pending command-line prompt: read from the user
+            view.print_user("\n> ");
+            std::string line;
+            bool another_line = true;
+            do {
+                another_line = view.readline(line, params.multiline_input);
+                buffer += line;
+            } while (another_line);
+        } else {
+            // process input prompt from args
+            for (auto & fname : params.image) { // media given on the command line are staged for this first message
+                if (!stage_media_file(fname, media_type_from_ext(fname))) {
+                    view.print_error(string_format("file does not exist or cannot be opened: '%s'\n", fname.c_str()));
+                    break; // remaining files are skipped; the prompt is still processed
+                }
+                view.print(string_format("Loaded media from '%s'\n", fname.c_str()));
+            }
+            buffer = params.prompt;
+            if (buffer.size() > 500) { // only the echo is truncated, the full prompt is used
+                view.print_user(string_format("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str()));
+            } else {
+                view.print_user(string_format("\n> %s\n", buffer.c_str()));
+            }
+            params.prompt.clear(); // only use it once
+        }
+        view.print("\n");
+
+        if (should_stop()) { // interrupted while waiting for input: leave the loop
+            g_cli_interrupted.store(false);
+            break;
+        }
+
+        // remove trailing newline
+        if (!buffer.empty() && buffer.back() == '\n') {
+            buffer.pop_back();
+        }
+
+        // skip empty messages
+        if (buffer.empty()) {
+            continue;
+        }
+
+        bool add_user_msg = true; // false for /regen, which reuses the existing history
+
+        // process commands
+        if (string_starts_with(buffer, "/exit")) { // NOTE(review): presumably a prefix test, so "/exit <anything>" also exits - confirm
+            break;
+        } else if (string_starts_with(buffer, "/regen")) {
+            if (messages.size() >= 2) {
+                size_t last_idx = messages.size() - 1;
+                messages.erase(last_idx); // drop the last message, then generate again below
+                add_user_msg = false;
+            } else {
+                view.print_error("No message to regenerate.\n");
+                continue;
+            }
+        } else if (string_starts_with(buffer, "/clear")) {
+            messages.clear();
+            add_system_prompt();
+
+            pending_media = json::array(); // staged media are discarded as well
+            view.print("Chat history cleared.\n");
+            continue;
+        } else if (
+                (string_starts_with(buffer, "/image ") && has_vision) ||
+                (string_starts_with(buffer, "/audio ") && has_audio) ||
+                (string_starts_with(buffer, "/video ") && has_video)) {
+            std::string type = buffer.substr(1, 5); // "image", "audio" or "video"
+            // just in case (bad copy-paste for example), we strip all trailing/leading spaces
+            std::string fname = string_strip(buffer.substr(7));
+            if (!stage_media_file(fname, type)) {
+                view.print_error(string_format("file does not exist or cannot be opened: '%s'\n", fname.c_str()));
+                continue;
+            }
+            view.print(string_format("Loaded media from '%s'\n", fname.c_str()));
+            continue;
+        } else if (string_starts_with(buffer, "/read ")) {
+            std::string fname = string_strip(buffer.substr(6));
+            add_text_file(fname);
+            continue;
+        } else if (string_starts_with(buffer, "/glob ")) {
+            std::error_code ec;
+            size_t count = 0; // number of files loaded so far by this command
+            auto curdir = std::filesystem::current_path();
+            std::string pattern = string_strip(buffer.substr(6));
+            std::filesystem::path rel_path; // directory prefix split off the pattern, if any
+
+            auto startglob = pattern.find_first_of("![*?"); // position of the first glob metacharacter
+            if (startglob != std::string::npos && startglob != 0) {
+                auto endpath = pattern.substr(0, startglob).find_last_of('/');
+                if (endpath != std::string::npos) { // the pattern starts with a literal directory part
+                    std::string rel_pattern = pattern.substr(0, endpath);
+#if !defined(_WIN32)
+                    if (string_starts_with(rel_pattern, '~')) {
+                        const char * home = std::getenv("HOME");
+                        if (home && home[0]) {
+                            rel_pattern = home + rel_pattern.substr(1); // expand a leading '~' using $HOME
+                        }
+                    }
+#endif
+                    rel_path = rel_pattern;
+                    pattern.erase(0, endpath + 1);
+                    curdir /= rel_path; // the search starts in that directory
+                }
+            }
+
+            for (const auto & entry : std::filesystem::recursive_directory_iterator(curdir,
+                    std::filesystem::directory_options::skip_permission_denied, ec)) {
+                if (!entry.is_regular_file()) {
+                    continue;
+                }
+
+                std::string rel = std::filesystem::relative(entry.path(), curdir, ec).string();
+                if (ec) {
+                    ec.clear();
+                    continue;
+                }
+                std::replace(rel.begin(), rel.end(), '\\', '/'); // match against forward-slash paths
+
+                if (!glob_match(pattern, rel)) {
+                    continue;
+                }
+
+                if (!add_text_file((rel_path / rel).string())) {
+                    continue;
+                }
+
+                if (++count >= FILE_GLOB_MAX_RESULTS) {
+                    view.print_error(string_format("Maximum number of globbed files allowed (%zu) reached.\n", FILE_GLOB_MAX_RESULTS));
+                    break;
+                }
+            }
+            continue;
+        } else {
+            // not a command
+            cur_msg += buffer;
+        }
+
+        // generate response
+        if (add_user_msg) {
+            push_user_message(cur_msg);
+            cur_msg.clear();
+        }
+        cli_timings timings;
+        std::string assistant_content;
+        generate_completion(assistant_content, timings); // result not checked: whatever text was received is stored below
+        messages.push_back({
+            {"role",    "assistant"},
+            {"content", assistant_content}
+        });
+        view.print("\n");
+
+        if (params.show_timings) {
+            view.print_info(string_format("\n[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n",
+                    timings.prompt_per_second, timings.predicted_per_second));
+        }
+
+        if (params.single_turn) {
+            break;
+        }
+    }
+
+    view.print("\nExiting...\n");
+
+    return 0; // the loop has no failing exit path
+}
+
+void cli_context::shutdown() {
+    if (server) { // only engaged when init() spawned a local llama-server
+        server->stop();
+        server.reset(); // destroy the cli_server object, leaving the optional empty
+    }
+}
diff --git a/tools/cli/cli-context.h b/tools/cli/cli-context.h
new file mode 100644
index 000000000000..a7f725ee76f6
--- /dev/null
+++ b/tools/cli/cli-context.h
@@ -0,0 +1,65 @@
+// controller for llama-cli (the "controller" in MVC)
+//
+// owns the chat state, drives the view and talks to llama-server through
+// cli_client; when no --server-base is given it also manages a local
+// llama-server child process via cli_server
+
+#pragma once
+
+#include "common.h"
+
+#include "cli-client.h"
+#include "cli-server.h"
+#include "cli-view.h"
+
+#include <atomic>
+#include <optional>
+#include <string>
+
+// set by the SIGINT handler; cleared once the interrupt has been handled
+extern std::atomic<bool> g_cli_interrupted;
+
+struct cli_timings {
+    double prompt_per_second    = 0.0; // printed by run() as "Prompt: ... t/s"
+    double predicted_per_second = 0.0; // printed by run() as "Generation: ... t/s"
+};
+
+struct cli_context {
+    common_params params;             // own copy, taken in the constructor
+
+    cli_view & view;                  // not owned, must outlive this context
+    cli_client client;                // always initialized
+    std::optional<cli_server> server; // only set when no --server-base is given
+
+    json messages      = json::array(); // chat history; run() appends each assistant reply
+    json pending_media = json::array(); // staged multimodal content parts
+
+    // properties of the connected server
+    std::string model_name;
+    std::string build_info;
+    bool has_vision = false;
+    bool has_audio  = false;
+    bool has_video  = false;
+
+    cli_context(const common_params & params, cli_view & view) : params(params), view(view) {}
+
+    // connect to --server-base or spawn a local llama-server child;
+    // argc/argv are needed to forward the server-relevant args to the child
+    bool init(int argc, char ** argv);
+
+    // run the interactive chat loop, returns the process exit code
+    int run();
+
+    // stop the local server child (if any)
+    void shutdown();
+
+private:
+    bool generate_completion(std::string & assistant_content, cli_timings & timings); // NOTE(review): run() ignores the bool result - confirm intended
+    void fetch_server_props();
+    void add_system_prompt();
+    void push_user_message(const std::string & text);
+
+    // read a file and stage it as a multimodal content part; type is one of
+    // "image", "audio", "video"; returns false if the file cannot be read
+    bool stage_media_file(const std::string & fname, const std::string & type);
+};
diff --git a/tools/cli/cli-server.cpp b/tools/cli/cli-server.cpp
new file mode 100644
index 000000000000..1914169bfe57
--- /dev/null
+++ b/tools/cli/cli-server.cpp
@@ -0,0 +1,336 @@
+#include "cli-server.h"
+
+#include <sheredom/subprocess.h>
+
+#include <chrono>
+#include <cstring>
+#include <filesystem>
+#include <stdexcept>
+#include <vector>
+
+#ifdef _WIN32
+#include <winsock2.h>
+#include <windows.h>
+#else
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+extern char **environ;
+#endif
+
+#if defined(__APPLE__) && defined(__MACH__)
+#include <mach-o/dyld.h>
+#include <limits.h>
+#endif
+
+#if defined(__linux__)
+#include <limits.h>
+#endif
+
+// stdin/stdout command protocol between the CLI and the spawned server,
+// keep in sync with tools/server/server-models.cpp
+#define CMD_ROUTER_TO_CHILD_EXIT   "cmd_router_to_child:exit"
+#define CMD_CHILD_TO_ROUTER_READY  "cmd_child_to_router:ready"
+#define CMD_CHILD_TO_ROUTER_PREFIX "cmd_child_to_router:"
+
+// address for the spawned server; always loopback, never exposed
+#define CHILD_ADDR "127.0.0.1"
+
+static constexpr int    CLI_SERVER_STOP_TIMEOUT_SEC = 10;
+static constexpr size_t CLI_SERVER_MAX_LOG_LINES    = 200;
+
+static bool line_starts_with(const std::string & line, const char * prefix) {
+    return line.compare(0, std::strlen(prefix), prefix) == 0; // true iff line begins with prefix
+}
+
+// ask the OS for a currently unused TCP port, returns -1 on failure
+// (same logic as get_free_port() in tools/server/server-models.cpp)
+static int get_free_port() {
+#ifdef _WIN32
+    WSADATA wsaData;
+    if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {
+        return -1;
+    }
+    using native_socket_t = SOCKET;
+    const native_socket_t invalid_socket = INVALID_SOCKET;
+    const auto close_socket = [](native_socket_t s) { closesocket(s); };
+#else
+    using native_socket_t = int;
+    const native_socket_t invalid_socket = -1;
+    const auto close_socket = [](native_socket_t s) { close(s); };
+#endif
+
+    native_socket_t sock = socket(AF_INET, SOCK_STREAM, 0);
+    if (sock == invalid_socket) {
+#ifdef _WIN32
+        WSACleanup();
+#endif
+        return -1;
+    }
+
+    int port = -1;
+    struct sockaddr_in serv_addr;
+    std::memset(&serv_addr, 0, sizeof(serv_addr));
+    serv_addr.sin_family = AF_INET;
+    serv_addr.sin_addr.s_addr = htonl(INADDR_ANY);
+    serv_addr.sin_port = htons(0); // port 0: let the OS pick an unused port
+
+#ifdef _WIN32
+    int namelen = sizeof(serv_addr);
+#else
+    socklen_t namelen = sizeof(serv_addr);
+#endif
+    if (bind(sock, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) == 0 &&
+            getsockname(sock, (struct sockaddr *) &serv_addr, &namelen) == 0) {
+        port = ntohs(serv_addr.sin_port);
+    }
+
+    // the probe socket is released here, so the port is only likely (not guaranteed) to still be free later
+    close_socket(sock);
+#ifdef _WIN32
+    WSACleanup();
+#endif
+    return port;
+}
+
+#ifdef _WIN32
+static std::string wide_to_utf8(const wchar_t * ws) {
+    if (!ws || !*ws) { // null or empty input
+        return {};
+    }
+    int len = WideCharToMultiByte(CP_UTF8, 0, ws, -1, nullptr, 0, nullptr, nullptr); // size query; with -1 the count includes the terminating NUL
+    std::string out(len > 0 ? len - 1 : 0, '\0'); // len <= 0 (failure) yields an empty string
+    if (len > 1) {
+        WideCharToMultiByte(CP_UTF8, 0, ws, -1, out.data(), len, nullptr, nullptr);
+    }
+    return out;
+}
+#endif
+
+// copy of the current process environment, one string per entry (same logic as get_environment() in tools/server/server-models.cpp)
+static std::vector<std::string> get_environment() {
+    std::vector<std::string> env;
+
+#ifdef _WIN32
+    LPWCH env_block = GetEnvironmentStringsW();
+    if (!env_block) {
+        return env; // empty on failure
+    }
+    for (LPWCH e = env_block; *e; e += wcslen(e) + 1) { // NUL-separated entries, an empty entry ends the block
+        env.emplace_back(wide_to_utf8(e));
+    }
+    FreeEnvironmentStringsW(env_block);
+#else
+    if (environ == nullptr) {
+        return env; // no environment at all
+    }
+    for (char ** e = environ; *e != nullptr; e++) { // nullptr-terminated array
+        env.emplace_back(*e);
+    }
+#endif
+
+    return env;
+}
+
+// path of the llama-server binary expected next to the current executable; throws std::runtime_error
+// if the own path cannot be resolved (same logic as get_server_exec_path() in tools/server/server-models.cpp)
+static std::filesystem::path get_server_bin_path() {
+#if defined(_WIN32)
+    wchar_t buf[32768] = { 0 };
+    DWORD len = GetModuleFileNameW(nullptr, buf, _countof(buf));
+    if (len == 0 || len >= _countof(buf)) {
+        throw std::runtime_error("GetModuleFileNameW failed or path too long");
+    }
+    std::filesystem::path self_path(buf);
+    return self_path.parent_path() / "llama-server.exe";
+#elif defined(__APPLE__) && defined(__MACH__)
+    char small_path[PATH_MAX];
+    uint32_t size = sizeof(small_path);
+    std::filesystem::path self_path;
+    if (_NSGetExecutablePath(small_path, &size) == 0) {
+        self_path = std::filesystem::path(small_path);
+    } else {
+        std::vector<char> buf(size); // on failure size was updated to the required length
+        if (_NSGetExecutablePath(buf.data(), &size) != 0) {
+            throw std::runtime_error("_NSGetExecutablePath failed after buffer resize");
+        }
+        self_path = std::filesystem::path(buf.data());
+    }
+    try {
+        self_path = std::filesystem::canonical(self_path);
+    } catch (...) {
+        // ignore, use the raw path
+    }
+    return self_path.parent_path() / "llama-server";
+#else
+    char path[FILENAME_MAX];
+    ssize_t count = readlink("/proc/self/exe", path, FILENAME_MAX);
+    if (count <= 0 || count >= FILENAME_MAX) { // error, or a completely filled buffer: the target may be truncated
+        throw std::runtime_error("failed to resolve /proc/self/exe");
+    }
+    std::filesystem::path self_path(std::string(path, count)); // readlink() does not NUL-terminate
+    return self_path.parent_path() / "llama-server";
+#endif
+}
+
+// build a nullptr-terminated char * array (argv/envp layout) over vec;
+// the entries alias the strings in vec, so vec must outlive the result
+static std::vector<char *> to_char_ptr_array(const std::vector<std::string> & vec) {
+    // one extra slot, pre-filled with nullptr, acts as the terminator
+    std::vector<char *> ptrs(vec.size() + 1, nullptr);
+    for (size_t i = 0; i < vec.size(); ++i) {
+        // const_cast only to fit the char * element type; nothing is written here
+        ptrs[i] = const_cast<char *>(vec[i].c_str());
+    }
+    return ptrs;
+}
+
+cli_server::~cli_server() {
+    stop(); // returns right away if the child was never started or was already stopped
+}
+
+bool cli_server::start(const std::vector<std::string> & args, bool pass_output_) {
+    if (started) { // assigning to the still-joinable log_thread below would std::terminate
+        last_error = "llama-server child process was already started (call stop() first)";
+        return false;
+    }
+    pass_output = pass_output_;
+    ready.store(false);  // a previous start()/stop() cycle may have left these set,
+    exited.store(false); // which would make wait_ready() return immediately
+
+    std::filesystem::path bin_path;
+    try {
+        bin_path = get_server_bin_path();
+    } catch (const std::exception & e) {
+        last_error = e.what();
+        return false;
+    }
+    std::error_code ec; // non-throwing exists(): a filesystem error counts as "not found"
+    if (!std::filesystem::exists(bin_path, ec)) {
+        last_error = "llama-server binary not found at " + bin_path.string() +
+                     "\nllama-cli requires llama-server to run a local model";
+        return false;
+    }
+    port = get_free_port();
+    if (port <= 0) {
+        last_error = "failed to get a free port number";
+        return false;
+    }
+
+    std::vector<std::string> child_args;
+    child_args.push_back(bin_path.string());
+    child_args.insert(child_args.end(), args.begin(), args.end());
+    child_args.push_back("--host"); // host/port are appended after the forwarded args
+    child_args.push_back(CHILD_ADDR);
+    child_args.push_back("--port");
+    child_args.push_back(std::to_string(port));
+
+    std::vector<std::string> child_env = get_environment();
+    // make the server run in child mode: it will report readiness on stdout
+    // and exit as soon as its stdin reaches EOF (the value is unused)
+    child_env.push_back("LLAMA_SERVER_ROUTER_PORT=0");
+    std::vector<char *> argv = to_char_ptr_array(child_args); // views into child_args / child_env,
+    std::vector<char *> envp = to_char_ptr_array(child_env);  // both stay alive until the spawn below
+
+    subproc = std::make_shared<subprocess_s>();
+    int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr;
+    if (subprocess_create_ex(argv.data(), options, envp.data(), subproc.get()) != 0) {
+        last_error = "failed to spawn " + bin_path.string();
+        subproc.reset();
+        return false;
+    }
+    started     = true;
+    child_stdin = subprocess_stdin(subproc.get());
+    log_thread = std::thread([this]() { // reader thread: runs until the child's output hits EOF
+        FILE * child_stdout = subprocess_stdout(subproc.get());
+        std::vector<char> buf(128 * 1024);
+        if (child_stdout) {
+            while (fgets(buf.data(), static_cast<int>(buf.size()), child_stdout) != nullptr) {
+                std::string line(buf.data());
+                if (line_starts_with(line, CMD_CHILD_TO_ROUTER_READY)) {
+                    {
+                        std::lock_guard<std::mutex> lk(mtx);
+                        ready.store(true);
+                    }
+                    cv.notify_all(); // wakes wait_ready()
+                } else if (!line_starts_with(line, CMD_CHILD_TO_ROUTER_PREFIX)) { // other protocol lines are dropped
+                    {
+                        std::lock_guard<std::mutex> lk(mtx);
+                        output_lines.push_back(line);
+                        if (output_lines.size() > CLI_SERVER_MAX_LOG_LINES) {
+                            output_lines.pop_front(); // keep only the most recent lines
+                        }
+                    }
+                    if (pass_output) {
+                        fputs(line.c_str(), stderr);
+                    }
+                }
+            }
+        }
+        // EOF means the child exited (or crashed)
+        {
+            std::lock_guard<std::mutex> lk(mtx);
+            exited.store(true);
+        }
+        cv.notify_all(); // wakes wait_ready() and stop()
+    });
+    return true;
+}
+
+bool cli_server::wait_ready(const std::function<bool()> & is_aborted) {
+    std::unique_lock<std::mutex> lk(mtx);
+    while (!ready.load() && !exited.load()) {
+        if (is_aborted && is_aborted()) { // an empty callback means "never abort" instead of throwing
+            return false;
+        }
+        cv.wait_for(lk, std::chrono::milliseconds(100)); // bounded wait so is_aborted is polled regularly
+    }
+    return ready.load(); // false when the loop ended because the child exited first
+}
+
+void cli_server::stop() {
+    if (!started) { // never started, or already stopped
+        return;
+    }
+
+    if (!exited.load() && child_stdin != nullptr) {
+        fprintf(child_stdin, "%s\n", CMD_ROUTER_TO_CHILD_EXIT); // ask the child to exit on its own
+        fflush(child_stdin);
+
+        // wait for a graceful exit, force-kill after timeout
+        std::unique_lock<std::mutex> lk(mtx); // released at the end of this block, before the join below
+        cv.wait_for(lk, std::chrono::seconds(CLI_SERVER_STOP_TIMEOUT_SEC), [this]() {
+            return exited.load();
+        });
+    }
+
+    if (!exited.load()) { // no EOF seen yet (timeout, or no stdin to ask through)
+        subprocess_terminate(subproc.get());
+    }
+
+    if (log_thread.joinable()) { // the reader thread ends once the child's output reaches EOF
+        log_thread.join();
+    }
+
+    int exit_code = 0; // collected but not reported to the caller
+    subprocess_join(subproc.get(), &exit_code);
+    subprocess_destroy(subproc.get());
+
+    started     = false; // makes further stop() calls return early
+    child_stdin = nullptr;
+    subproc.reset();
+}
+
+std::string cli_server::address() const {
+    return "http://" CHILD_ADDR ":" + std::to_string(port); // adjacent literals are joined at compile time
+}
+
+std::string cli_server::recent_output() const {
+    std::string joined;
+    std::lock_guard<std::mutex> guard(mtx); // output_lines is appended to by the log thread
+    for (auto it = output_lines.begin(); it != output_lines.end(); ++it) {
+        joined.append(*it); // lines are stored exactly as read, no separator is added
+    }
+    return joined;
+}
diff --git a/tools/cli/cli-server.h b/tools/cli/cli-server.h
new file mode 100644
index 000000000000..758f3ec4c957
--- /dev/null
+++ b/tools/cli/cli-server.h
@@ -0,0 +1,66 @@
+// local llama-server process management for llama-cli
+//
+// when no --server-base is given, the CLI spawns a llama-server child process
+// and talks to it over HTTP; the child lifetime is managed the same way the
+// server router mode manages model instances:
+// - the child is spawned with LLAMA_SERVER_ROUTER_PORT set, which makes it
+//   watch its stdin and exit on EOF, so no orphan is left behind if the CLI
+//   dies unexpectedly
+// - the parent reads the child's stdout and waits for the ready line
+// - on stop, the parent sends an exit command on stdin and force-kills the
+//   child after a timeout
+
+#pragma once
+
+#include <atomic>
+#include <condition_variable>
+#include <cstdio>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+struct subprocess_s;
+
+struct cli_server {
+    ~cli_server(); // calls stop()
+
+    // spawn llama-server (located next to the current executable) with the
+    // given args on a free port; if pass_output is true, the child output is
+    // forwarded to stderr; returns false on failure (see last_error)
+    bool start(const std::vector<std::string> & args, bool pass_output);
+
+    // wait until the child reports it is ready to accept requests
+    // returns false if the child exited or is_aborted returned true
+    bool wait_ready(const std::function<bool()> & is_aborted);
+
+    // gracefully stop the child process (force-kill after a timeout)
+    void stop();
+
+    bool alive() const { return started && !exited; } // started and no EOF seen on the child's output yet
+
+    std::string address() const; // "http://<loopback address>:<port>" of the child
+
+    // last lines of child output, for error reporting
+    std::string recent_output() const;
+
+    std::string last_error; // set by start() when it returns false
+    int port = 0;           // port picked by start(); 0 until then
+
+private:
+    std::shared_ptr<subprocess_s> subproc; // child process handle, set between start() and stop()
+    FILE * child_stdin = nullptr;          // child's stdin, used by stop() to send the exit command
+    std::thread log_thread;                // reads the child's combined stdout/stderr line by line
+    bool started     = false;
+    bool pass_output = false;
+
+    std::atomic<bool> ready{false};  // set by log_thread when the ready line is seen
+    std::atomic<bool> exited{false}; // set by log_thread when the child's output reaches EOF
+
+    mutable std::mutex mtx; // guards output_lines and is the mutex waited on with cv
+    std::condition_variable cv;
+    std::deque<std::string> output_lines; // bounded buffer of the most recent child output lines
+};
diff --git a/tools/cli/cli-view.cpp b/tools/cli/cli-view.cpp
new file mode 100644
index 000000000000..b1e41174a593
--- /dev/null
+++ b/tools/cli/cli-view.cpp
@@ -0,0 +1,63 @@
+#include "cli-view.h"
+
+void cli_view_console::init(bool simple_io, bool use_color) {
+    console::init(simple_io, use_color);
+    console::set_display(DISPLAY_TYPE_RESET);
+    curr_display = DISPLAY_TYPE_RESET; // keep the cached display in sync with the console
+}
+
+void cli_view_console::cleanup() {
+    console::cleanup();
+}
+
+bool cli_view_console::readline(std::string & line, bool multiline_input) {
+    return console::readline(line, multiline_input); // result is passed through unchanged
+}
+
+void cli_view_console::set_completion_callback(completion_callback cb) {
+    console::set_completion_callback(std::move(cb));
+}
+
+void cli_view_console::set_display(display_type display) {
+    if (curr_display != display) { // skip the console call when nothing changes
+        console::set_display(display);
+        curr_display = display;
+    }
+}
+
+void cli_view_console::print(const std::string & text) {
+    set_display(DISPLAY_TYPE_RESET);
+    console::log("%s", text.c_str()); // "%s" keeps text from being parsed as a format string
+}
+
+void cli_view_console::print_reasoning(const std::string & text) {
+    set_display(DISPLAY_TYPE_REASONING); // stays active until another print_* call switches it
+    console::log("%s", text.c_str());
+}
+
+void cli_view_console::print_info(const std::string & text) {
+    set_display(DISPLAY_TYPE_INFO);
+    console::log("%s", text.c_str());
+    set_display(DISPLAY_TYPE_RESET); // unlike the other print_* calls, info resets right away
+}
+
+void cli_view_console::print_user(const std::string & text) {
+    set_display(DISPLAY_TYPE_USER_INPUT); // stays active until another print_* call switches it
+    console::log("%s", text.c_str());
+}
+
+void cli_view_console::print_error(const std::string & text) {
+    console::error("%s", text.c_str()); // restores the current display on its own
+}
+
+void cli_view_console::spinner_start() {
+    console::spinner::start();
+}
+
+void cli_view_console::spinner_stop() {
+    console::spinner::stop();
+}
+
+void cli_view_console::flush() {
+    console::flush();
+}
diff --git a/tools/cli/cli-view.h b/tools/cli/cli-view.h
new file mode 100644
index 000000000000..345f61799e52
--- /dev/null
+++ b/tools/cli/cli-view.h
@@ -0,0 +1,61 @@
+// view layer for llama-cli (the "view" in MVC)
+//
+// the view owns all user-facing input/output; it knows nothing about HTTP,
+// process management or chat state
+
+#pragma once
+
+#include "console.h"
+
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+struct cli_view {
+    // returns matches as (replacement line, cursor position); NOTE(review): std::function presumably arrives via console.h, <functional> is not included here - confirm
+    using completion_callback = std::function<std::vector<std::pair<std::string, size_t>>(std::string_view, size_t)>;
+
+    virtual ~cli_view() = default; // implementations may be destroyed through a cli_view pointer/reference
+
+    virtual void init(bool simple_io, bool use_color) = 0;
+    virtual void cleanup() = 0;
+
+    // read a line from the user; returns true if the input continues on another line
+    virtual bool readline(std::string & line, bool multiline_input) = 0;
+    virtual void set_completion_callback(completion_callback cb) = 0;
+
+    virtual void print(const std::string & text) = 0;           // assistant / generic output
+    virtual void print_reasoning(const std::string & text) = 0; // reasoning stream
+    virtual void print_info(const std::string & text) = 0;      // metadata (banner, timings)
+    virtual void print_user(const std::string & text) = 0;      // user input marker / echo
+    virtual void print_error(const std::string & text) = 0;     // error messages
+
+    virtual void spinner_start() = 0; // progress indicator while waiting
+    virtual void spinner_stop() = 0;
+    virtual void flush() = 0;
+};
+
+// cli_view implementation backed by common/console
+struct cli_view_console : cli_view {
+    void init(bool simple_io, bool use_color) override; // also resets the display
+    void cleanup() override;
+
+    bool readline(std::string & line, bool multiline_input) override;
+    void set_completion_callback(completion_callback cb) override;
+
+    void print(const std::string & text) override;
+    void print_reasoning(const std::string & text) override;
+    void print_info(const std::string & text) override;
+    void print_user(const std::string & text) override;
+    void print_error(const std::string & text) override;
+
+    void spinner_start() override;
+    void spinner_stop() override;
+    void flush() override;
+
+private:
+    void set_display(display_type display); // switches the console display only when it differs from curr_display
+
+    display_type curr_display = DISPLAY_TYPE_RESET; // display type last applied through this view
+};
diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp
index 3ed345bf0f0a..d59cbb7ec1bc 100644
--- a/tools/cli/cli.cpp
+++ b/tools/cli/cli.cpp
@@ -1,20 +1,14 @@
-#include "chat.h"
-#include "common.h"
 #include "arg.h"
-#include "console.h"
-#include "fit.h"
-// #include "log.h"
+#include "common.h"
+#include "log.h"
 
-#include "server-common.h"
-#include "server-context.h"
-#include "server-task.h"
+#include "cli-context.h"
+#include "cli-view.h"
 
 #include <array>
-#include <atomic>
 #include <algorithm>
 #include <filesystem>
-#include <fstream>
-#include <thread>
+#include <string_view>
 #include <signal.h>
 
 #if defined(_WIN32)
@@ -25,215 +19,19 @@
 #include <windows.h>
 #endif
 
-const char * LLAMA_ASCII_LOGO = R"(
-▄▄ ▄▄
-██ ██
-██ ██  ▀▀█▄ ███▄███▄  ▀▀█▄    ▄████ ████▄ ████▄
-██ ██ ▄█▀██ ██ ██ ██ ▄█▀██    ██    ██ ██ ██ ██
-██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀
-                                    ██    ██
-                                    ▀▀    ▀▀
-)";
-
-static std::atomic<bool> g_is_interrupted = false;
-static bool should_stop() {
-    return g_is_interrupted.load();
-}
-
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 static void signal_handler(int) {
-    if (g_is_interrupted.load()) {
+    if (g_cli_interrupted.load()) {
         // second Ctrl+C - exit immediately
         // make sure to clear colors before exiting (not using LOG or console.cpp here to avoid deadlock)
         fprintf(stdout, "\033[0m\n");
         fflush(stdout);
         std::exit(130);
     }
-    g_is_interrupted.store(true);
+    g_cli_interrupted.store(true);
 }
 #endif
 
-struct cli_context {
-    server_context ctx_server;
-    json messages = json::array();
-    std::vector<raw_buffer> input_files;
-    task_params defaults;
-    bool verbose_prompt;
-
-    // thread for showing "loading" animation
-    std::atomic<bool> loading_show;
-
-    cli_context(const common_params & params) {
-        defaults.sampling    = params.sampling;
-        defaults.speculative = params.speculative;
-        defaults.n_keep      = params.n_keep;
-        defaults.n_predict   = params.n_predict;
-        defaults.antiprompt  = params.antiprompt;
-
-        defaults.stream = true; // make sure we always use streaming mode
-        defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way
-        // defaults.return_progress = true; // TODO: show progress
-
-        verbose_prompt = params.verbose_prompt;
-    }
-
-    std::string generate_completion(result_timings & out_timings) {
-        server_response_reader rd = ctx_server.get_response_reader();
-        auto chat_params = format_chat();
-        {
-            // TODO: reduce some copies here in the future
-            server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
-            task.id         = rd.get_new_id();
-            task.index      = 0;
-            task.params     = defaults;           // copy
-            task.cli_prompt = chat_params.prompt; // copy
-            task.cli_files  = input_files;        // copy
-            task.cli        = true;
-
-            // chat template settings
-            task.params.chat_parser_params = common_chat_parser_params(chat_params);
-            task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
-            if (!chat_params.parser.empty()) {
-                task.params.chat_parser_params.parser.load(chat_params.parser);
-            }
-
-            // reasoning budget sampler
-            if (!chat_params.thinking_end_tag.empty()) {
-                const llama_vocab * vocab = llama_model_get_vocab(
-                    llama_get_model(ctx_server.get_llama_context()));
-
-                task.params.sampling.reasoning_budget_tokens = defaults.sampling.reasoning_budget_tokens;
-                task.params.sampling.generation_prompt = chat_params.generation_prompt;
-
-                if (!chat_params.thinking_start_tag.empty()) {
-                    task.params.sampling.reasoning_budget_start =
-                        common_tokenize(vocab, chat_params.thinking_start_tag, false, true);
-                }
-                task.params.sampling.reasoning_budget_end =
-                    common_tokenize(vocab, chat_params.thinking_end_tag, false, true);
-                task.params.sampling.reasoning_budget_forced =
-                    common_tokenize(vocab, defaults.sampling.reasoning_budget_message + chat_params.thinking_end_tag, false, true);
-            }
-
-            rd.post_task({std::move(task)});
-        }
-
-        if (verbose_prompt) {
-            console::set_display(DISPLAY_TYPE_PROMPT);
-            console::log("%s\n\n", chat_params.prompt.c_str());
-            console::set_display(DISPLAY_TYPE_RESET);
-        }
-
-        // wait for first result
-        console::spinner::start();
-        server_task_result_ptr result = rd.next(should_stop);
-
-        while (true) {
-            auto res_partial = dynamic_cast<server_task_result_cmpl_partial *>(result.get());
-            if (res_partial && res_partial->is_begin) {
-                // this is the "send 200 status to client" signal in streaming mode
-                // skip, do not stop the spinner
-                result = rd.next(should_stop);
-            } else {
-                console::spinner::stop();
-                break;
-            }
-        }
-
-        std::string curr_content;
-        bool is_thinking = false;
-
-        while (result) {
-            if (should_stop()) {
-                break;
-            }
-            if (result->is_error()) {
-                json err_data = result->to_json();
-                if (err_data.contains("message")) {
-                    console::error("Error: %s\n", err_data["message"].get<std::string>().c_str());
-                } else {
-                    console::error("Error: %s\n", err_data.dump().c_str());
-                }
-                return curr_content;
-            }
-            auto res_partial = dynamic_cast<server_task_result_cmpl_partial *>(result.get());
-            if (res_partial) {
-                out_timings = std::move(res_partial->timings);
-                for (const auto & diff : res_partial->oaicompat_msg_diffs) {
-                    if (!diff.content_delta.empty()) {
-                        if (is_thinking) {
-                            console::log("\n[End thinking]\n\n");
-                            console::set_display(DISPLAY_TYPE_RESET);
-                            is_thinking = false;
-                        }
-                        curr_content += diff.content_delta;
-                        console::log("%s", diff.content_delta.c_str());
-                        console::flush();
-                    }
-                    if (!diff.reasoning_content_delta.empty()) {
-                        console::set_display(DISPLAY_TYPE_REASONING);
-                        if (!is_thinking) {
-                            console::log("[Start thinking]\n");
-                        }
-                        is_thinking = true;
-                        console::log("%s", diff.reasoning_content_delta.c_str());
-                        console::flush();
-                    }
-                }
-            }
-            auto res_final = dynamic_cast<server_task_result_cmpl_final *>(result.get());
-            if (res_final) {
-                out_timings = std::move(res_final->timings);
-                break;
-            }
-            result = rd.next(should_stop);
-        }
-        g_is_interrupted.store(false);
-        // server_response_reader automatically cancels pending tasks upon destruction
-        return curr_content;
-    }
-
-    // TODO: support remote files in the future (http, https, etc)
-    std::string load_input_file(const std::string & fname, bool is_media) {
-        std::ifstream file(fname, std::ios::binary);
-        if (!file) {
-            return "";
-        }
-        if (is_media) {
-            raw_buffer buf;
-            buf.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
-            input_files.push_back(std::move(buf));
-            return get_media_marker();
-        } else {
-            std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
-            return content;
-        }
-    }
-
-    common_chat_params format_chat() {
-        auto meta = ctx_server.get_meta();
-        auto & chat_params = meta.chat_params;
-
-        auto caps = common_chat_templates_get_caps(chat_params.tmpls.get());
-
-        common_chat_templates_inputs inputs;
-        inputs.messages              = common_chat_msgs_parse_oaicompat(messages);
-        inputs.tools                 = {}; // TODO
-        inputs.tool_choice           = COMMON_CHAT_TOOL_CHOICE_NONE;
-        inputs.json_schema           = ""; // TODO
-        inputs.grammar               = ""; // TODO
-        inputs.use_jinja             = chat_params.use_jinja;
-        inputs.parallel_tool_calls   = caps["supports_parallel_tool_calls"];
-        inputs.add_generation_prompt = true;
-        inputs.reasoning_format      = COMMON_REASONING_FORMAT_DEEPSEEK;
-        inputs.force_pure_content    = chat_params.force_pure_content;
-        inputs.enable_thinking       = chat_params.enable_thinking ? common_chat_templates_support_enable_thinking(chat_params.tmpls.get()) : false;
-
-        // Apply chat template to the list of messages
-        return common_chat_templates_apply(chat_params.tmpls.get(), inputs);
-    }
-};
-
 // TODO?: Make this reusable, enums, docs
 static const std::array<std::string_view, 8> cmds = {
     "/audio ",
@@ -352,7 +150,8 @@ static std::vector<std::pair<std::string, size_t>> auto_completion_callback(std:
     return matches;
 }
 
-static constexpr size_t FILE_GLOB_MAX_RESULTS = 100;
+// file-scope so that it is still alive when the atexit() handler registered in llama_cli() calls g_view.cleanup()
+static cli_view_console g_view;
 
 // satisfies -Wmissing-declarations
 int llama_cli(int argc, char ** argv);
@@ -370,22 +169,15 @@ int llama_cli(int argc, char ** argv) {
 
     // TODO: maybe support it later?
     if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) {
-        console::error("--no-conversation is not supported by llama-cli\n");
-        console::error("please use llama-completion instead\n");
+        g_view.print_error("--no-conversation is not supported by llama-cli\n");
+        g_view.print_error("please use llama-completion instead\n");
     }
 
-    // struct that contains llama context and inference
-    cli_context ctx_cli(params);
-
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
     // TODO: avoid using atexit() here by making `console` a singleton
-    console::init(params.simple_io, params.use_color);
-    atexit([]() { console::cleanup(); });
+    g_view.init(params.simple_io, params.use_color);
+    atexit([]() { g_view.cleanup(); });
 
-    console::set_display(DISPLAY_TYPE_RESET);
-    console::set_completion_callback(auto_completion_callback);
+    g_view.set_completion_callback(auto_completion_callback);
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
     struct sigaction sigint_action;
@@ -401,273 +193,16 @@ int llama_cli(int argc, char ** argv) {
     SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
-    console::log("\nLoading model... "); // followed by loading animation
-    console::spinner::start();
-    if (!ctx_cli.ctx_server.load_model(params)) {
-        console::spinner::stop();
-        console::error("\nFailed to load the model\n");
-        return 1;
-    }
-
-    ctx_cli.defaults.sampling = params.sampling;
-
-    console::spinner::stop();
-    console::log("\n");
-
-    std::thread inference_thread([&ctx_cli]() {
-        ctx_cli.ctx_server.start_loop();
-    });
-
-    auto inf = ctx_cli.ctx_server.get_meta();
-    std::string modalities = "text";
-    if (inf.has_inp_image) {
-        modalities += ", vision";
-    }
-    if (inf.has_inp_audio) {
-        modalities += ", audio";
-    }
-
-    auto add_system_prompt = [&]() {
-        if (!params.system_prompt.empty()) {
-            ctx_cli.messages.push_back({
-                {"role",    "system"},
-                {"content", params.system_prompt}
-            });
-        }
-    };
-    add_system_prompt();
-
-    console::log("\n");
-    console::log("%s\n", LLAMA_ASCII_LOGO);
-    console::log("build      : %s\n", inf.build_info.c_str());
-    console::log("model      : %s\n", inf.model_name.c_str());
-    console::log("modalities : %s\n", modalities.c_str());
-    if (!params.system_prompt.empty()) {
-        console::log("using custom system prompt\n");
-    }
-    console::log("\n");
-    console::log("available commands:\n");
-    console::log("  /exit or Ctrl+C     stop or exit\n");
-    console::log("  /regen              regenerate the last response\n");
-    console::log("  /clear              clear the chat history\n");
-    console::log("  /read <file>        add a text file\n");
-    console::log("  /glob <pattern>     add text files using globbing pattern\n");
-    if (inf.has_inp_image) {
-        console::log("  /image <file>       add an image file\n");
-    }
-    if (inf.has_inp_audio) {
-        console::log("  /audio <file>       add an audio file\n");
-    }
-    if (inf.has_inp_video) {
-        console::log("  /video <file>       add a video file\n");
-    }
-    console::log("\n");
-
-    // interactive loop
-    std::string cur_msg;
-
-    auto add_text_file = [&](const std::string & fname) -> bool {
-        std::string marker = ctx_cli.load_input_file(fname, false);
-        if (marker.empty()) {
-            console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
-            return false;
-        }
-        if (inf.fim_sep_token != LLAMA_TOKEN_NULL) {
-            cur_msg += common_token_to_piece(ctx_cli.ctx_server.get_llama_context(), inf.fim_sep_token, true);
-            cur_msg += fname;
-            cur_msg.push_back('\n');
-        } else {
-            cur_msg += "--- File: ";
-            cur_msg += fname;
-            cur_msg += " ---\n";
-        }
-        cur_msg += marker;
-        console::log("Loaded text from '%s'\n", fname.c_str());
-        return true;
-    };
-
-    while (true) {
-        std::string buffer;
-        console::set_display(DISPLAY_TYPE_USER_INPUT);
-        if (params.prompt.empty()) {
-            console::log("\n> ");
-            std::string line;
-            bool another_line = true;
-            do {
-                another_line = console::readline(line, params.multiline_input);
-                buffer += line;
-            } while (another_line);
-        } else {
-            // process input prompt from args
-            for (auto & fname : params.image) {
-                std::string marker = ctx_cli.load_input_file(fname, true);
-                if (marker.empty()) {
-                    console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
-                    break;
-                }
-                console::log("Loaded media from '%s'\n", fname.c_str());
-                cur_msg += marker;
-            }
-            buffer = params.prompt;
-            if (buffer.size() > 500) {
-                console::log("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str());
-            } else {
-                console::log("\n> %s\n", buffer.c_str());
-            }
-            params.prompt.clear(); // only use it once
-        }
-        console::set_display(DISPLAY_TYPE_RESET);
-        console::log("\n");
-
-        if (should_stop()) {
-            g_is_interrupted.store(false);
-            break;
-        }
-
-        // remove trailing newline
-        if (!buffer.empty() &&buffer.back() == '\n') {
-            buffer.pop_back();
-        }
-
-        // skip empty messages
-        if (buffer.empty()) {
-            continue;
-        }
-
-        bool add_user_msg = true;
-
-        // process commands
-        if (string_starts_with(buffer, "/exit")) {
-            break;
-        } else if (string_starts_with(buffer, "/regen")) {
-            if (ctx_cli.messages.size() >= 2) {
-                size_t last_idx = ctx_cli.messages.size() - 1;
-                ctx_cli.messages.erase(last_idx);
-                add_user_msg = false;
-            } else {
-                console::error("No message to regenerate.\n");
-                continue;
-            }
-        } else if (string_starts_with(buffer, "/clear")) {
-            ctx_cli.messages.clear();
-            add_system_prompt();
-
-            ctx_cli.input_files.clear();
-            console::log("Chat history cleared.\n");
-            continue;
-        } else if (
-                (string_starts_with(buffer, "/image ") && inf.has_inp_image) ||
-                (string_starts_with(buffer, "/audio ") && inf.has_inp_audio) ||
-                (string_starts_with(buffer, "/video ") && inf.has_inp_video)) {
-            // just in case (bad copy-paste for example), we strip all trailing/leading spaces
-            std::string fname = string_strip(buffer.substr(7));
-            std::string marker = ctx_cli.load_input_file(fname, true);
-            if (marker.empty()) {
-                console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
-                continue;
-            }
-            cur_msg += marker;
-            console::log("Loaded media from '%s'\n", fname.c_str());
-            continue;
-        } else if (string_starts_with(buffer, "/read ")) {
-            std::string fname = string_strip(buffer.substr(6));
-            add_text_file(fname);
-            continue;
-        } else if (string_starts_with(buffer, "/glob ")) {
-            std::error_code ec;
-            size_t count = 0;
-            auto curdir = std::filesystem::current_path();
-            std::string pattern = string_strip(buffer.substr(6));
-            std::filesystem::path rel_path;
-
-            auto startglob = pattern.find_first_of("![*?");
-            if (startglob != std::string::npos && startglob != 0) {
-                auto endpath = pattern.substr(0, startglob).find_last_of('/');
-                if (endpath != std::string::npos) {
-                    std::string rel_pattern = pattern.substr(0, endpath);
-#if !defined(_WIN32)
-                    if (string_starts_with(rel_pattern, '~')) {
-                        const char * home = std::getenv("HOME");
-                        if (home && home[0]) {
-                            rel_pattern = home + rel_pattern.substr(1);
-                        }
-                    }
-#endif
-                    rel_path = rel_pattern;
-                    pattern.erase(0, endpath + 1);
-                    curdir /= rel_path;
-                }
-            }
-
-            for (const auto & entry : std::filesystem::recursive_directory_iterator(curdir,
-                    std::filesystem::directory_options::skip_permission_denied, ec)) {
-                if (!entry.is_regular_file()) {
-                    continue;
-                }
-
-                std::string rel = std::filesystem::relative(entry.path(), curdir, ec).string();
-                if (ec) {
-                    ec.clear();
-                    continue;
-                }
-                std::replace(rel.begin(), rel.end(), '\\', '/');
-
-                if (!glob_match(pattern, rel)) {
-                    continue;
-                }
-
-                if (!add_text_file((rel_path / rel).string())) {
-                    continue;
-                }
-
-                if (++count >= FILE_GLOB_MAX_RESULTS) {
-                    console::error("Maximum number of globbed files allowed (%zu) reached.\n", FILE_GLOB_MAX_RESULTS);
-                    break;
-                }
-            }
-            continue;
-        } else {
-            // not a command
-            cur_msg += buffer;
-        }
-
-        // generate response
-        if (add_user_msg) {
-            ctx_cli.messages.push_back({
-                {"role",    "user"},
-                {"content", cur_msg}
-            });
-            cur_msg.clear();
-        }
-        result_timings timings;
-        std::string assistant_content = ctx_cli.generate_completion(timings);
-        ctx_cli.messages.push_back({
-            {"role",    "assistant"},
-            {"content", assistant_content}
-        });
-        console::log("\n");
+    cli_context ctx_cli(params, g_view);
 
-        if (params.show_timings) {
-            console::set_display(DISPLAY_TYPE_INFO);
-            console::log("\n");
-            console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second);
-            console::set_display(DISPLAY_TYPE_RESET);
-        }
-
-        if (params.single_turn) {
-            break;
-        }
+    if (!ctx_cli.init(argc, argv)) {
+        ctx_cli.shutdown();
+        return 1;
     }
 
-    console::set_display(DISPLAY_TYPE_RESET);
-
-    console::log("\nExiting...\n");
-    ctx_cli.ctx_server.terminate();
-    inference_thread.join();
+    int ret = ctx_cli.run();
 
-    // bump the log level to display timings
-    common_log_set_verbosity_thold(LOG_LEVEL_INFO);
-    common_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context());
+    ctx_cli.shutdown();
 
-    return 0;
+    return ret;
 }
diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt
index 7d427431db93..f5b499c57f30 100644
--- a/tools/server/CMakeLists.txt
+++ b/tools/server/CMakeLists.txt
@@ -57,3 +57,8 @@ install(TARGETS ${TARGET} RUNTIME)
 
 target_link_libraries(${TARGET} PRIVATE llama-server-impl)
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if (TARGET llama-cli)
+    # llama-cli spawns llama-server at runtime when no --server-base is given
+    add_dependencies(llama-cli llama-server)
+endif()

From 1a8faba7fabc8898137b5abe811e3c80198209b0 Mon Sep 17 00:00:00 2001
From: Piotr Wilkin <piotr.wilkin@syndatis.com>
Date: Sat, 13 Jun 2026 00:35:42 +0200
Subject: [PATCH 2/2] cli : move presentation logic from the controller into
 the view

The controller now only reports semantic events and data objects
(loading started, server info, response began, reasoning/content
deltas, timings, messages and errors); the view decides how to present
them: spinner handling, thinking markers, banner layout, command list
alignment, prompt markers, echo truncation and timing formats all live
in cli_view_console now.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 tools/cli/cli-context.cpp | 160 ++++++++++++--------------------------
 tools/cli/cli-context.h   |   5 --
 tools/cli/cli-view.cpp    | 151 +++++++++++++++++++++++++++++++----
 tools/cli/cli-view.h      |  79 +++++++++++++++----
 tools/cli/cli.cpp         |   4 +-
 5 files changed, 250 insertions(+), 149 deletions(-)

diff --git a/tools/cli/cli-context.cpp b/tools/cli/cli-context.cpp
index fdb7c0c38f4e..e2a098a07f78 100644
--- a/tools/cli/cli-context.cpp
+++ b/tools/cli/cli-context.cpp
@@ -10,16 +10,6 @@
 #include <map>
 #include <set>
 
-static const char * LLAMA_ASCII_LOGO = R"(
-▄▄ ▄▄
-██ ██
-██ ██  ▀▀█▄ ███▄███▄  ▀▀█▄    ▄████ ████▄ ████▄
-██ ██ ▄█▀██ ██ ██ ██ ▄█▀██    ██    ██ ██ ██ ██
-██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀
-                                    ██    ██
-                                    ▀▀    ▀▀
-)";
-
 std::atomic<bool> g_cli_interrupted = false;
 
 static bool should_stop() {
@@ -119,34 +109,30 @@ bool cli_context::init(int argc, char ** argv) {
         }
         client.server_base = base;
 
-        view.print("Connecting to " + client.server_base + " ... ");
-        view.spinner_start();
+        view.show_loading("Connecting to " + client.server_base);
     } else {
         if (params.model.path.empty() && params.model.url.empty() &&
                 params.model.hf_repo.empty() && params.model.docker_repo.empty()) {
-            view.print_error("no model specified\n");
-            view.print("use -m <file.gguf> or -hf <user/repo> to run a local model,\n"
-                       "or --server-base <url> to connect to a running llama-server\n");
+            view.show_error("no model specified");
+            view.show_message("use -m <file.gguf> or -hf <user/repo> to run a local model,\n"
+                              "or --server-base <url> to connect to a running llama-server");
             return false;
         }
 
         const bool pass_output = params.verbosity >= LOG_LEVEL_INFO;
 
-        view.print("Loading model... ");
-        view.spinner_start();
+        view.show_loading("Loading model");
 
         server.emplace();
         if (!server->start(filter_server_args(argc, argv), pass_output)) {
-            view.spinner_stop();
-            view.print_error("\n" + server->last_error + "\n");
+            view.show_error(server->last_error);
             return false;
         }
         if (!server->wait_ready(should_stop)) {
-            view.spinner_stop();
             if (!should_stop()) {
-                view.print_error("\nthe server exited before becoming ready\n");
+                view.show_error("the server exited before becoming ready");
                 if (!pass_output) {
-                    view.print(server->recent_output());
+                    view.show_message(server->recent_output());
                 }
             }
             return false;
@@ -166,17 +152,15 @@ bool cli_context::init(int argc, char ** argv) {
         client.last_error = e.what();
     }
     if (!healthy) {
-        view.spinner_stop();
         if (!should_stop()) {
-            view.print_error("\n" + client.last_error + "\n");
+            view.show_error(client.last_error);
         }
         return false;
     }
 
     fetch_server_props();
 
-    view.spinner_stop();
-    view.print("\n");
+    view.hide_loading();
 
     return true;
 }
@@ -277,24 +261,14 @@ bool cli_context::generate_completion(std::string & assistant_content, cli_timin
         {"timings_per_token", true},
     };
 
-    bool is_thinking   = false;
-    bool spinner_alive = true;
-    bool stream_error  = false;
+    bool stream_error = false;
 
-    auto stop_spinner = [&]() {
-        if (spinner_alive) {
-            spinner_alive = false;
-            view.spinner_stop();
-        }
-    };
-
-    view.spinner_start();
+    view.begin_response();
 
     json err = client.create_chat_completion(body, should_stop, [&](const json & chunk) {
         if (chunk.contains("error")) {
-            stop_spinner();
             stream_error = true;
-            view.print_error("Error: " + format_error_message(chunk) + "\n");
+            view.show_error(format_error_message(chunk));
             return;
         }
         if (chunk.contains("timings")) {
@@ -313,83 +287,57 @@ bool cli_context::generate_completion(std::string & assistant_content, cli_timin
         if (delta.contains("reasoning_content") && delta.at("reasoning_content").is_string()) {
             const std::string text = delta.at("reasoning_content").get<std::string>();
             if (!text.empty()) {
-                stop_spinner();
-                if (!is_thinking) {
-                    view.print_reasoning("[Start thinking]\n");
-                    is_thinking = true;
-                }
-                view.print_reasoning(text);
-                view.flush();
+                view.on_reasoning_delta(text);
             }
         }
         if (delta.contains("content") && delta.at("content").is_string()) {
             const std::string text = delta.at("content").get<std::string>();
             if (!text.empty()) {
-                stop_spinner();
-                if (is_thinking) {
-                    view.print_reasoning("\n[End thinking]\n\n");
-                    is_thinking = false;
-                }
                 assistant_content += text;
-                view.print(text);
-                view.flush();
+                view.on_content_delta(text);
             }
         }
     });
 
-    stop_spinner();
+    view.end_response();
     g_cli_interrupted.store(false);
 
     if (!err.is_null()) {
-        view.print_error("Error: " + format_error_message(err) + "\n");
+        view.show_error(format_error_message(err));
         return false;
     }
     return !stream_error;
 }
 
 int cli_context::run() {
-    std::string modalities = "text";
-    if (has_vision) {
-        modalities += ", vision";
-    }
-    if (has_audio) {
-        modalities += ", audio";
-    }
-    if (has_video) {
-        modalities += ", video";
-    }
-
     add_system_prompt();
 
-    view.print("\n");
-    view.print(LLAMA_ASCII_LOGO);
-    view.print("\n");
-    if (!build_info.empty()) {
-        view.print(string_format("build      : %s\n", build_info.c_str()));
-    }
-    view.print(string_format("model      : %s\n", model_name.empty() ? "(unknown)" : model_name.c_str()));
-    view.print(string_format("server     : %s%s\n", client.server_base.c_str(), server ? " (managed by llama-cli)" : ""));
-    view.print(string_format("modalities : %s\n", modalities.c_str()));
-    if (!params.system_prompt.empty()) {
-        view.print("using custom system prompt\n");
-    }
-    view.print("\n");
-    view.print("available commands:\n");
-    view.print("  /exit or Ctrl+C     stop or exit\n");
-    view.print("  /regen              regenerate the last response\n");
-    view.print("  /clear              clear the chat history\n");
-    view.print("  /read <file>        add a text file\n");
-    view.print("  /glob <pattern>     add text files using globbing pattern\n");
+    cli_server_info info;
+    info.build_info        = build_info;
+    info.model_name        = model_name;
+    info.server_base       = client.server_base;
+    info.is_local_server   = server.has_value();
+    info.has_system_prompt = !params.system_prompt.empty();
+    info.has_vision        = has_vision;
+    info.has_audio         = has_audio;
+    info.has_video         = has_video;
+    info.commands = {
+        {"/exit or Ctrl+C", "stop or exit"},
+        {"/regen",          "regenerate the last response"},
+        {"/clear",          "clear the chat history"},
+        {"/read <file>",    "add a text file"},
+        {"/glob <pattern>", "add text files using globbing pattern"},
+    };
     if (has_vision) {
-        view.print("  /image <file>       add an image file\n");
+        info.commands.push_back({"/image <file>", "add an image file"});
     }
     if (has_audio) {
-        view.print("  /audio <file>       add an audio file\n");
+        info.commands.push_back({"/audio <file>", "add an audio file"});
     }
     if (has_video) {
-        view.print("  /video <file>       add a video file\n");
+        info.commands.push_back({"/video <file>", "add a video file"});
     }
-    view.print("\n");
+    view.show_banner(info);
 
     // interactive loop
     std::string cur_msg;
@@ -397,7 +345,7 @@ int cli_context::run() {
     auto add_text_file = [&](const std::string & fname) -> bool {
         std::ifstream file(fname, std::ios::binary);
         if (!file) {
-            view.print_error(string_format("file does not exist or cannot be opened: '%s'\n", fname.c_str()));
+            view.show_error(string_format("file does not exist or cannot be opened: '%s'", fname.c_str()));
             return false;
         }
         std::string content((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
@@ -405,14 +353,14 @@ int cli_context::run() {
         cur_msg += fname;
         cur_msg += " ---\n";
         cur_msg += content;
-        view.print(string_format("Loaded text from '%s'\n", fname.c_str()));
+        view.show_message(string_format("Loaded text from '%s'", fname.c_str()));
         return true;
     };
 
     while (true) {
         std::string buffer;
         if (params.prompt.empty()) {
-            view.print_user("\n> ");
+            view.prompt_user();
             std::string line;
             bool another_line = true;
             do {
@@ -423,20 +371,16 @@ int cli_context::run() {
             // process input prompt from args
             for (auto & fname : params.image) {
                 if (!stage_media_file(fname, media_type_from_ext(fname))) {
-                    view.print_error(string_format("file does not exist or cannot be opened: '%s'\n", fname.c_str()));
+                    view.show_error(string_format("file does not exist or cannot be opened: '%s'", fname.c_str()));
                     break;
                 }
-                view.print(string_format("Loaded media from '%s'\n", fname.c_str()));
+                view.show_message(string_format("Loaded media from '%s'", fname.c_str()));
             }
             buffer = params.prompt;
-            if (buffer.size() > 500) {
-                view.print_user(string_format("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str()));
-            } else {
-                view.print_user(string_format("\n> %s\n", buffer.c_str()));
-            }
+            view.echo_user(buffer);
             params.prompt.clear(); // only use it once
         }
-        view.print("\n");
+        view.end_user_input();
 
         if (should_stop()) {
             g_cli_interrupted.store(false);
@@ -464,7 +408,7 @@ int cli_context::run() {
                 messages.erase(last_idx);
                 add_user_msg = false;
             } else {
-                view.print_error("No message to regenerate.\n");
+                view.show_error("No message to regenerate.");
                 continue;
             }
         } else if (string_starts_with(buffer, "/clear")) {
@@ -472,7 +416,7 @@ int cli_context::run() {
             add_system_prompt();
 
             pending_media = json::array();
-            view.print("Chat history cleared.\n");
+            view.show_message("Chat history cleared.");
             continue;
         } else if (
                 (string_starts_with(buffer, "/image ") && has_vision) ||
@@ -482,10 +426,10 @@ int cli_context::run() {
             // just in case (bad copy-paste for example), we strip all trailing/leading spaces
             std::string fname = string_strip(buffer.substr(7));
             if (!stage_media_file(fname, type)) {
-                view.print_error(string_format("file does not exist or cannot be opened: '%s'\n", fname.c_str()));
+                view.show_error(string_format("file does not exist or cannot be opened: '%s'", fname.c_str()));
                 continue;
             }
-            view.print(string_format("Loaded media from '%s'\n", fname.c_str()));
+            view.show_message(string_format("Loaded media from '%s'", fname.c_str()));
             continue;
         } else if (string_starts_with(buffer, "/read ")) {
             std::string fname = string_strip(buffer.substr(6));
@@ -539,7 +483,7 @@ int cli_context::run() {
                 }
 
                 if (++count >= FILE_GLOB_MAX_RESULTS) {
-                    view.print_error(string_format("Maximum number of globbed files allowed (%zu) reached.\n", FILE_GLOB_MAX_RESULTS));
+                    view.show_error(string_format("Maximum number of globbed files allowed (%zu) reached.", FILE_GLOB_MAX_RESULTS));
                     break;
                 }
             }
@@ -561,11 +505,9 @@ int cli_context::run() {
             {"role",    "assistant"},
             {"content", assistant_content}
         });
-        view.print("\n");
 
         if (params.show_timings) {
-            view.print_info(string_format("\n[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n",
-                    timings.prompt_per_second, timings.predicted_per_second));
+            view.show_timings(timings);
         }
 
         if (params.single_turn) {
@@ -573,7 +515,7 @@ int cli_context::run() {
         }
     }
 
-    view.print("\nExiting...\n");
+    view.show_message("Exiting...");
 
     return 0;
 }
diff --git a/tools/cli/cli-context.h b/tools/cli/cli-context.h
index a7f725ee76f6..20cddcdefad9 100644
--- a/tools/cli/cli-context.h
+++ b/tools/cli/cli-context.h
@@ -19,11 +19,6 @@
 // set by the SIGINT handler; cleared once the interrupt has been handled
 extern std::atomic<bool> g_cli_interrupted;
 
-struct cli_timings {
-    double prompt_per_second    = 0.0;
-    double predicted_per_second = 0.0;
-};
-
 struct cli_context {
     common_params params;
 
diff --git a/tools/cli/cli-view.cpp b/tools/cli/cli-view.cpp
index b1e41174a593..dbd9ab904cb6 100644
--- a/tools/cli/cli-view.cpp
+++ b/tools/cli/cli-view.cpp
@@ -1,5 +1,17 @@
 #include "cli-view.h"
 
+#include <algorithm>
+
+static const char * LLAMA_ASCII_LOGO = R"(
+▄▄ ▄▄
+██ ██
+██ ██  ▀▀█▄ ███▄███▄  ▀▀█▄    ▄████ ████▄ ████▄
+██ ██ ▄█▀██ ██ ██ ██ ▄█▀██    ██    ██ ██ ██ ██
+██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀
+                                    ██    ██
+                                    ▀▀    ▀▀
+)";
+
 void cli_view_console::init(bool simple_io, bool use_color) {
     console::init(simple_io, use_color);
     console::set_display(DISPLAY_TYPE_RESET);
@@ -25,39 +37,146 @@ void cli_view_console::set_display(display_type display) {
     }
 }
 
-void cli_view_console::print(const std::string & text) {
+void cli_view_console::stop_spinner() {
+    if (is_busy) {
+        is_busy = false;
+        console::spinner::stop();
+    }
+}
+
+void cli_view_console::show_loading(const std::string & message) {
     set_display(DISPLAY_TYPE_RESET);
-    console::log("%s", text.c_str());
+    console::log("%s... ", message.c_str());
+    if (!is_busy) {
+        is_busy = true;
+        console::spinner::start();
+    }
 }
 
-void cli_view_console::print_reasoning(const std::string & text) {
-    set_display(DISPLAY_TYPE_REASONING);
-    console::log("%s", text.c_str());
+void cli_view_console::hide_loading() {
+    stop_spinner();
+    console::log("\n");
+}
+
+void cli_view_console::show_banner(const cli_server_info & info) {
+    set_display(DISPLAY_TYPE_RESET);
+    console::log("\n");
+    console::log("%s\n", LLAMA_ASCII_LOGO);
+    if (!info.build_info.empty()) {
+        console::log("build      : %s\n", info.build_info.c_str());
+    }
+    console::log("model      : %s\n", info.model_name.empty() ? "(unknown)" : info.model_name.c_str());
+    console::log("server     : %s%s\n", info.server_base.c_str(), info.is_local_server ? " (managed by llama-cli)" : "");
+
+    std::string modalities = "text";
+    if (info.has_vision) {
+        modalities += ", vision";
+    }
+    if (info.has_audio) {
+        modalities += ", audio";
+    }
+    if (info.has_video) {
+        modalities += ", video";
+    }
+    console::log("modalities : %s\n", modalities.c_str());
+
+    if (info.has_system_prompt) {
+        console::log("using custom system prompt\n");
+    }
+
+    if (!info.commands.empty()) {
+        size_t width = 0;
+        for (const auto & cmd : info.commands) {
+            width = std::max(width, cmd.usage.size());
+        }
+        console::log("\n");
+        console::log("available commands:\n");
+        for (const auto & cmd : info.commands) {
+            console::log("  %-*s    %s\n", (int) width, cmd.usage.c_str(), cmd.description.c_str());
+        }
+    }
+    console::log("\n");
+}
+
+void cli_view_console::show_message(const std::string & text) {
+    if (is_busy) {
+        // break the pending loading line
+        stop_spinner();
+        console::log("\n");
+    }
+    set_display(DISPLAY_TYPE_RESET);
+    console::log("%s\n", text.c_str());
+}
+
+void cli_view_console::show_error(const std::string & message) {
+    if (is_busy) {
+        // break the pending loading line
+        stop_spinner();
+        console::log("\n");
+    }
+    console::error("%s\n", message.c_str()); // restores the current display on its own
 }
 
-void cli_view_console::print_info(const std::string & text) {
+void cli_view_console::show_timings(const cli_timings & timings) {
     set_display(DISPLAY_TYPE_INFO);
-    console::log("%s", text.c_str());
+    console::log("\n[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n",
+            timings.prompt_per_second, timings.predicted_per_second);
     set_display(DISPLAY_TYPE_RESET);
 }
 
-void cli_view_console::print_user(const std::string & text) {
+void cli_view_console::prompt_user() {
     set_display(DISPLAY_TYPE_USER_INPUT);
-    console::log("%s", text.c_str());
+    console::log("\n> ");
 }
 
-void cli_view_console::print_error(const std::string & text) {
-    console::error("%s", text.c_str()); // restores the current display on its own
+void cli_view_console::echo_user(const std::string & text) {
+    static constexpr size_t MAX_ECHO_LENGTH = 500;
+    set_display(DISPLAY_TYPE_USER_INPUT);
+    if (text.size() > MAX_ECHO_LENGTH) {
+        console::log("\n> %s ... (truncated)\n", text.substr(0, MAX_ECHO_LENGTH).c_str());
+    } else {
+        console::log("\n> %s\n", text.c_str());
+    }
 }
 
-void cli_view_console::spinner_start() {
-    console::spinner::start();
+void cli_view_console::end_user_input() {
+    set_display(DISPLAY_TYPE_RESET);
+    console::log("\n");
 }
 
-void cli_view_console::spinner_stop() {
-    console::spinner::stop();
+void cli_view_console::begin_response() {
+    if (!is_busy) {
+        is_busy = true;
+        console::spinner::start();
+    }
 }
 
-void cli_view_console::flush() {
+void cli_view_console::on_reasoning_delta(const std::string & text) {
+    stop_spinner();
+    set_display(DISPLAY_TYPE_REASONING);
+    if (!is_thinking) {
+        is_thinking = true;
+        console::log("[Start thinking]\n");
+    }
+    console::log("%s", text.c_str());
     console::flush();
 }
+
+void cli_view_console::on_content_delta(const std::string & text) {
+    stop_spinner();
+    if (is_thinking) {
+        is_thinking = false;
+        set_display(DISPLAY_TYPE_REASONING);
+        console::log("\n[End thinking]\n\n");
+    }
+    set_display(DISPLAY_TYPE_RESET);
+    console::log("%s", text.c_str());
+    console::flush();
+}
+
+void cli_view_console::end_response() {
+    stop_spinner();
+    is_thinking = false;
+    set_display(DISPLAY_TYPE_RESET);
+    console::log("\n");
+}
diff --git a/tools/cli/cli-view.h b/tools/cli/cli-view.h
index 345f61799e52..a69cd8d32f9a 100644
--- a/tools/cli/cli-view.h
+++ b/tools/cli/cli-view.h
@@ -1,6 +1,8 @@
 // view layer for llama-cli (the "view" in MVC)
 //
-// the view owns all user-facing input/output; it knows nothing about HTTP,
+// the view owns all user-facing input/output; the controller only reports
+// semantic events and data objects, and the view decides how to present
+// them (styles, markers, spinners, layout); it knows nothing about HTTP,
 // process management or chat state
 
 #pragma once
@@ -12,6 +14,30 @@
 #include <utility>
 #include <vector>
 
+struct cli_timings { // data object passed to cli_view::show_timings()
+    double prompt_per_second    = 0.0; // NOTE(review): presumably prompt tokens processed per second - confirm against the server's timings
+    double predicted_per_second = 0.0; // NOTE(review): presumably generated tokens per second - confirm against the server's timings
+};
+
+struct cli_command_info { // one entry of cli_server_info::commands
+    std::string usage;       // e.g. "/read <file>"
+    std::string description; // e.g. "add a text file"
+};
+
+// properties of the connected server, shown on startup
+struct cli_server_info { // data object passed to cli_view::show_banner()
+    std::string build_info;
+    std::string model_name;
+    std::string server_base; // NOTE(review): presumably the server's base URL (cf. --server-base) - confirm
+    bool is_local_server   = false; // server is spawned and managed by llama-cli
+    bool has_system_prompt = false;
+    bool has_vision        = false; // NOTE(review): has_vision / has_audio / has_video look like supported input modalities - confirm
+    bool has_audio         = false;
+    bool has_video         = false;
+
+    std::vector<cli_command_info> commands; // command entries (usage + description each)
+};
+
 struct cli_view {
     // returns matches as (replacement line, cursor position)
     using completion_callback = std::function<std::vector<std::pair<std::string, size_t>>(std::string_view, size_t)>;
@@ -21,19 +47,29 @@ struct cli_view {
     virtual void init(bool simple_io, bool use_color) = 0;
     virtual void cleanup() = 0;
 
+    // input
     // read a line from the user; returns true if the input continues on another line
     virtual bool readline(std::string & line, bool multiline_input) = 0;
     virtual void set_completion_callback(completion_callback cb) = 0;
 
-    virtual void print(const std::string & text) = 0;           // assistant / generic output
-    virtual void print_reasoning(const std::string & text) = 0; // reasoning stream
-    virtual void print_info(const std::string & text) = 0;      // metadata (banner, timings)
-    virtual void print_user(const std::string & text) = 0;      // user input marker / echo
-    virtual void print_error(const std::string & text) = 0;
+    // generic events
+    virtual void show_loading(const std::string & message) = 0; // enter a busy state
+    virtual void hide_loading() = 0;                            // leave the busy state
+    virtual void show_banner(const cli_server_info & info) = 0; // properties of the connected server, shown on startup
+    virtual void show_message(const std::string & text) = 0;    // discrete informational message
+    virtual void show_error(const std::string & message) = 0;   // error report; the message may span several lines
+    virtual void show_timings(const cli_timings & timings) = 0; // timing figures (see cli_timings)
 
-    virtual void spinner_start() = 0;
-    virtual void spinner_stop() = 0;
-    virtual void flush() = 0;
+    // user input flow
+    virtual void prompt_user() = 0;                       // interactive input starts
+    virtual void echo_user(const std::string & text) = 0; // non-interactive input (e.g. from -p)
+    virtual void end_user_input() = 0;                    // input finished, output follows
+
+    // assistant response flow
+    virtual void begin_response() = 0;                             // waiting for the first token
+    virtual void on_reasoning_delta(const std::string & text) = 0; // streamed reasoning fragment
+    virtual void on_content_delta(const std::string & text) = 0;   // streamed content fragment
+    virtual void end_response() = 0;                               // response finished (or aborted)
 };
 
 // cli_view implementation backed by common/console
@@ -44,18 +80,27 @@ struct cli_view_console : cli_view {
     bool readline(std::string & line, bool multiline_input) override;
     void set_completion_callback(completion_callback cb) override;
 
-    void print(const std::string & text) override;
-    void print_reasoning(const std::string & text) override;
-    void print_info(const std::string & text) override;
-    void print_user(const std::string & text) override;
-    void print_error(const std::string & text) override;
+    void show_loading(const std::string & message) override;
+    void hide_loading() override;
+    void show_banner(const cli_server_info & info) override;
+    void show_message(const std::string & text) override;
+    void show_error(const std::string & message) override;
+    void show_timings(const cli_timings & timings) override;
+
+    void prompt_user() override;
+    void echo_user(const std::string & text) override;
+    void end_user_input() override;
 
-    void spinner_start() override;
-    void spinner_stop() override;
-    void flush() override;
+    void begin_response() override;
+    void on_reasoning_delta(const std::string & text) override;
+    void on_content_delta(const std::string & text) override;
+    void end_response() override;
 
 private:
     void set_display(display_type display);
+    void stop_spinner();
 
     display_type curr_display = DISPLAY_TYPE_RESET;
+    bool is_busy     = false; // a spinner is being shown
+    bool is_thinking = false; // inside a streamed reasoning block
 };
diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp
index d59cbb7ec1bc..1b1db86571e9 100644
--- a/tools/cli/cli.cpp
+++ b/tools/cli/cli.cpp
@@ -169,8 +169,8 @@ int llama_cli(int argc, char ** argv) {
 
     // TODO: maybe support it later?
     if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) {
-        g_view.print_error("--no-conversation is not supported by llama-cli\n");
-        g_view.print_error("please use llama-completion instead\n");
+        g_view.show_error("--no-conversation is not supported by llama-cli\n"
+                          "please use llama-completion instead");
     }
 
     // TODO: avoid using atexit() here by making `console` a singleton