ggml-org · pwilkin · Jun 12, 2026 · Jun 12, 2026
@@ -635,14 +635,17 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }
 
+    // with --server-base, the model is provided by the external server
+    const bool has_remote_server = !params.server_base.empty();
+
     // handle model and download
-    if (!skip_model_download) {
+    if (!skip_model_download && !has_remote_server) {
         common_params_handle_models(params, ctx_arg.ex);
     }
 
-    // model is required (except for server)
+    // model is required (except for server, or when an external server provides it)
     // TODO @ngxson : maybe show a list of available models in CLI in this case
-    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) {
+    if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion && !has_remote_server) {
         throw std::invalid_argument("error: --model is required\n");
     }
 
@@ -1547,6 +1550,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.single_turn = true;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
+    add_opt(common_arg(
+        {"--server-base"}, "URL",
+        "base url of an external llama-server instance to connect to, e.g. http://localhost:8080\n"
+        "when set, llama-cli does not spawn a local server and model args are ignored\n"
+        "(default: unset, spawn a local llama-server)",
+        [](common_params & params, const std::string & value) {
+            params.server_base = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SERVER_BASE"));
     add_opt(common_arg(
         {"-i", "--interactive"},
         string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),

@@ -563,6 +563,8 @@ struct common_params {
 
     bool single_turn       = false; // single turn chat conversation
 
+    std::string server_base = ""; // base url of an external llama-server to connect to (CLI) // NOLINT
+
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 

@@ -2,11 +2,21 @@
 
 set(TARGET llama-cli-impl)
 
-add_library(${TARGET} cli.cpp)
+# sources of the ${TARGET} library: cli.cpp plus one .cpp/.h pair each for
+# the client, context, server and view parts
+add_library(${TARGET}
+    cli.cpp
+    cli-client.cpp
+    cli-client.h
+    cli-context.cpp
+    cli-context.h
+    cli-server.cpp
+    cli-server.h
+    cli-view.cpp
+    cli-view.h
+)
 set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
-target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ../server)
-target_link_libraries(${TARGET} PUBLIC server-context llama-common ${CMAKE_THREAD_LIBS_INIT})
+target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_link_libraries(${TARGET} PUBLIC llama-common cpp-httplib ${CMAKE_THREAD_LIBS_INIT})
 
 if(LLAMA_TOOLS_INSTALL)
     install(TARGETS ${TARGET} LIBRARY)

@@ -0,0 +1,141 @@
+#include "cli-client.h"
+
+#include "http.h"
+
+#include <algorithm>
+#include <chrono>
+#include <thread>
+
+// read timeout in seconds used by get(), post() and post_sse();
+// generation can stall for a long time during prompt processing, so the
+// read timeout must be generous
+static constexpr time_t CLI_HTTP_READ_TIMEOUT_SEC = 3600;
+
+// upper bound (in bytes) for the accumulated response body that post_sse()
+// keeps for error reporting
+static constexpr size_t CLI_HTTP_MAX_ERROR_BODY = 1024 * 1024;
+
+// prepends the path component of the base url to `path`
+// - an empty or "/" base path leaves `path` untouched
+// - one trailing '/' of the base path is dropped before joining, so a base
+//   of "/api/" and a path of "/health" give "/api/health"
+static std::string join_path(const common_http_url & parts, const std::string & path) {
+    const std::string & base = parts.path;
+    const bool has_prefix = !base.empty() && base != "/";
+    if (!has_prefix) {
+        return path;
+    }
+    const size_t keep = base.back() == '/' ? base.size() - 1 : base.size();
+    return base.substr(0, keep) + path;
+}
+
+// checks an httplib result and parses the response body as JSON
+// `method` is only used to build the error messages (for example "GET")
+// throws std::runtime_error on transport error, on a non-2xx status and
+// when the body is not valid JSON
+static json parse_json_response(const httplib::Result & res,
+                                const std::string & method,
+                                const std::string & path,
+                                const std::string & server_base) {
+    if (!res) {
+        throw std::runtime_error("failed to connect to " + server_base + ": " + httplib::to_string(res.error()));
+    }
+    if (res->status < 200 || res->status >= 300) {
+        throw std::runtime_error(method + " " + path + " failed with status " + std::to_string(res->status) + ": " + res->body);
+    }
+    // parse without exceptions so that a malformed body is reported with
+    // the same exception type as the other failures
+    json result = json::parse(res->body, nullptr, false);
+    if (result.is_discarded()) {
+        throw std::runtime_error(method + " " + path + " returned invalid JSON");
+    }
+    return result;
+}
+
+// simple GET request relative to server_base, returns the response json
+// throws std::runtime_error, see parse_json_response()
+json cli_client::get(const std::string & path) {
+    auto [cli, parts] = common_http_client(server_base);
+    cli.set_read_timeout(CLI_HTTP_READ_TIMEOUT_SEC, 0);
+    return parse_json_response(cli.Get(join_path(parts, path)), "GET", path, server_base);
+}
+
+// simple POST request relative to server_base, `body` is sent as
+// application/json; returns the response json
+// throws std::runtime_error, see parse_json_response()
+json cli_client::post(const std::string & path, const json & body) {
+    auto [cli, parts] = common_http_client(server_base);
+    cli.set_read_timeout(CLI_HTTP_READ_TIMEOUT_SEC, 0);
+    return parse_json_response(cli.Post(join_path(parts, path), body.dump(), "application/json"), "POST", path, server_base);
+}
+
+// POST request with an SSE streaming response
+//
+// path        : request path, relative to server_base
+// body        : request payload, sent as application/json
+// should_stop : polled whenever a chunk of the response arrives; returning
+//               true aborts the request
+// on_data     : invoked once per "data" line whose payload is valid JSON;
+//               the "[DONE]" marker and non-JSON payloads are skipped
+//
+// returns a null json when the stream finished or was cancelled through
+// should_stop, otherwise a json with an "error" member
+json cli_client::post_sse(const std::string & path,
+                          const json & body,
+                          const std::function<bool()> & should_stop,
+                          const std::function<void(const json &)> & on_data) {
+    auto [cli, parts] = common_http_client(server_base);
+    cli.set_read_timeout(CLI_HTTP_READ_TIMEOUT_SEC, 0);
+
+    std::string pending;  // buffer for incomplete SSE lines
+    std::string raw_body; // accumulated body, used only for error reporting
+
+    // handles one complete line of the stream (without the trailing '\n')
+    auto dispatch_line = [&](std::string line) {
+        if (!line.empty() && line.back() == '\r') {
+            line.pop_back();
+        }
+        // only "data" fields are of interest; the single space after the
+        // colon is optional in the event stream format
+        if (line.compare(0, 5, "data:") != 0) {
+            return;
+        }
+        size_t start = 5;
+        if (start < line.size() && line[start] == ' ') {
+            ++start;
+        }
+        const std::string payload = line.substr(start);
+        if (payload == "[DONE]") {
+            return;
+        }
+        json event = json::parse(payload, nullptr, false);
+        if (!event.is_discarded()) {
+            on_data(event);
+        }
+    };
+
+    auto receiver = [&](const char * data, size_t len) -> bool {
+        if (should_stop()) {
+            return false; // aborts the request
+        }
+        if (raw_body.size() < CLI_HTTP_MAX_ERROR_BODY) {
+            raw_body.append(data, std::min(len, CLI_HTTP_MAX_ERROR_BODY - raw_body.size()));
+        }
+        pending.append(data, len);
+
+        // consume every complete line, then drop the consumed prefix once
+        size_t begin = 0;
+        size_t pos;
+        while ((pos = pending.find('\n', begin)) != std::string::npos) {
+            dispatch_line(pending.substr(begin, pos - begin));
+            begin = pos + 1;
+        }
+        pending.erase(0, begin);
+        return true;
+    };
+
+    httplib::Headers headers = {{"Accept", "text/event-stream"}};
+    auto res = cli.Post(join_path(parts, path), headers, body.dump(), "application/json", receiver);
+
+    if (!res) {
+        if (res.error() == httplib::Error::Canceled && should_stop()) {
+            return json(); // cancelled by the user
+        }
+        return json {{"error", {{"message", "failed to connect to " + server_base + ": " + httplib::to_string(res.error())}}}};
+    }
+    if (res->status < 200 || res->status >= 300) {
+        // prefer the error object sent by the server, if there is one
+        json error_body = json::parse(raw_body, nullptr, false);
+        if (!error_body.is_discarded() && error_body.contains("error")) {
+            return error_body;
+        }
+        return json {{"error", {{"message", "request failed with status " + std::to_string(res->status)}}}};
+    }
+    return json();
+}
+
+// polls GET /health until the server answers with status 200
+//
+// is_aborted : polled before every attempt; returning true ends the wait
+//
+// returns true once the server is ready; returns false (and sets
+// last_error) when is_aborted returned true or when 10 requests in a row
+// got no response at all
+bool cli_client::wait_health(const std::function<bool()> & is_aborted) {
+    constexpr int  max_failed_attempts = 10;
+    constexpr auto poll_interval       = std::chrono::milliseconds(300);
+
+    int failed_attempts = 0; // consecutive requests without any response
+    while (!is_aborted()) {
+        auto [cli, parts] = common_http_client(server_base);
+        cli.set_connection_timeout(1, 0);
+        auto res = cli.Get(join_path(parts, "/health"));
+        if (res) {
+            // the server answered, so it is reachable: earlier failures
+            // must not count against a later hiccup
+            failed_attempts = 0;
+            if (res->status == 200) {
+                return true;
+            }
+            // any other status means the server is up but not ready yet
+            // (e.g. 503 while the model is still loading)
+        } else if (++failed_attempts >= max_failed_attempts) {
+            last_error = "failed to connect to " + server_base + ": " + httplib::to_string(res.error());
+            return false;
+        }
+        std::this_thread::sleep_for(poll_interval);
+    }
+    last_error = "aborted while waiting for the server to become ready";
+    return false;
+}
@@ -0,0 +1,57 @@
+// HTTP API client for llama-cli (the "model" in MVC)
+//
+// a thin client for the llama-server HTTP API, roughly equivalent to the
+// openai python package: it only provides data and never touches the view;
+// real-time responses are delivered through callbacks
+
+#pragma once
+
+#include "ggml.h" // for GGML_ASSERT
+
+#define JSON_ASSERT GGML_ASSERT
+#include <nlohmann/json.hpp>
+
+#include <functional>
+#include <string>
+
+using json = nlohmann::ordered_json;
+
+// client for the HTTP API of the server at server_base
+// all request paths are relative to server_base
+struct cli_client {
+    std::string server_base; // base url, for example "http://127.0.0.1:8080"
+    std::string last_error;  // set when wait_health() fails
+
+    // simple GET request, returns the response json
+    // throws std::runtime_error on transport error, on a non-2xx status
+    // and when the response body is not valid JSON
+    json get(const std::string & path);
+
+    // simple POST request (body is sent as application/json), returns the
+    // response json
+    // throws std::runtime_error on transport error, on a non-2xx status
+    // and when the response body is not valid JSON
+    json post(const std::string & path, const json & body);
+
+    // POST request with an SSE streaming response; on_data is invoked once
+    // per "data:" event (the "[DONE]" marker and payloads that are not
+    // valid JSON are skipped); should_stop is polled whenever a chunk of
+    // the response arrives and aborts the request when it returns true;
+    // the function returns after the stream is finished:
+    // a null json on graceful exit (incl. cancellation via should_stop),
+    // the error response json otherwise
+    // failures are reported through the returned json, not by throwing
+    json post_sse(const std::string & path,
+                  const json & body,
+                  const std::function<bool()> & should_stop,
+                  const std::function<void(const json &)> & on_data);
+
+    // poll /health until the server is ready to accept requests (status 200)
+    // returns false if is_aborted returned true or the server is unreachable;
+    // in both cases last_error describes the reason
+    bool wait_health(const std::function<bool()> & is_aborted);
+
+    //
+    // higher-level wrappers
+    //
+
+    // streaming POST to /v1/chat/completions; arguments and return value
+    // are those of post_sse()
+    json create_chat_completion(const json & request,
+                                const std::function<bool()> & should_stop,
+                                const std::function<void(const json &)> & on_data) {
+        return post_sse("/v1/chat/completions", request, should_stop, on_data);
+    }
+
+    // GET /props; return value and exceptions are those of get()
+    json get_props() {
+        return get("/props");
+    }
+};