From 59797670dcc801f62fb40a6abd1bb65405967119 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 23 Jun 2026 13:14:28 +0200 Subject: [PATCH 1/9] cli: move to HTTP-based implementation --- common/common.h | 3 + common/http.h | 70 +++++ tools/cli/CMakeLists.txt | 4 +- tools/cli/cli-client.cpp | 141 +++++++++ tools/cli/cli-client.h | 52 ++++ tools/cli/cli-context.cpp | 547 +++++++++++++++++++++++++++++++++ tools/cli/cli-context.h | 82 +++++ tools/cli/cli-server.h | 73 +++++ tools/cli/cli-view.h | 103 +++++++ tools/cli/cli.cpp | 515 +------------------------------ tools/server/server-models.cpp | 72 +---- 11 files changed, 1092 insertions(+), 570 deletions(-) create mode 100644 tools/cli/cli-client.cpp create mode 100644 tools/cli/cli-client.h create mode 100644 tools/cli/cli-context.cpp create mode 100644 tools/cli/cli-context.h create mode 100644 tools/cli/cli-server.h create mode 100644 tools/cli/cli-view.h diff --git a/common/common.h b/common/common.h index f2f2202ec2d5..0b9dd4976688 100644 --- a/common/common.h +++ b/common/common.h @@ -631,6 +631,9 @@ struct common_params { std::map default_template_kwargs; + // CLI params + std::string server_base; + // UI configs bool ui = true; bool ui_mcp_proxy = false; diff --git a/common/http.h b/common/http.h index d3daccd6bf48..0c51d094ac3d 100644 --- a/common/http.h +++ b/common/http.h @@ -2,6 +2,16 @@ #include +#ifdef _WIN32 +#include +#include +#else +#include +#include +#include +#include +#endif + struct common_http_url { std::string scheme; std::string user; @@ -97,3 +107,63 @@ static std::pair common_http_client(const std: static std::string common_http_show_masked_url(const common_http_url & parts) { return parts.scheme + "://" + (parts.user.empty() ? "" : "****:****@") + parts.host + parts.path; } + +static int common_http_get_free_port() { +#ifdef _WIN32 + WSADATA wsaData; + if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) { + return -1; + } + typedef SOCKET native_socket_t; +#define INVALID_SOCKET_VAL INVALID_SOCKET +#define CLOSE_SOCKET(s) closesocket(s) +#else + typedef int native_socket_t; +#define INVALID_SOCKET_VAL -1 +#define CLOSE_SOCKET(s) close(s) +#endif + + native_socket_t sock = socket(AF_INET, SOCK_STREAM, 0); + if (sock == INVALID_SOCKET_VAL) { +#ifdef _WIN32 + WSACleanup(); +#endif + return -1; + } + + struct sockaddr_in serv_addr; + std::memset(&serv_addr, 0, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + serv_addr.sin_port = htons(0); + + if (bind(sock, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) != 0) { + CLOSE_SOCKET(sock); +#ifdef _WIN32 + WSACleanup(); +#endif + return -1; + } + +#ifdef _WIN32 + int namelen = sizeof(serv_addr); +#else + socklen_t namelen = sizeof(serv_addr); +#endif + if (getsockname(sock, (struct sockaddr*)&serv_addr, &namelen) != 0) { + CLOSE_SOCKET(sock); +#ifdef _WIN32 + WSACleanup(); +#endif + return -1; + } + + int port = ntohs(serv_addr.sin_port); + + CLOSE_SOCKET(sock); +#ifdef _WIN32 + WSACleanup(); +#endif + + return port; +} diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt index a3e635719b67..2fa648bf0c89 100644 --- a/tools/cli/CMakeLists.txt +++ b/tools/cli/CMakeLists.txt @@ -2,7 +2,9 @@ set(TARGET llama-cli-impl) -add_library(${TARGET} cli.cpp) +add_library(${TARGET} cli.cpp + cli-client.cpp + cli-context.cpp) set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ../server) diff --git a/tools/cli/cli-client.cpp b/tools/cli/cli-client.cpp new file mode 100644 index 000000000000..d45affba931c --- /dev/null +++ b/tools/cli/cli-client.cpp @@ -0,0 +1,141 @@ +#include "cli-client.h" + +#include "http.h" + +#include +#include +#include + +// generation can stall for a long time during prompt processing, so the +// read timeout must be generous +static constexpr time_t CLI_HTTP_READ_TIMEOUT_SEC = 3600; + +// upper bound for the accumulated response body kept for error reporting +static constexpr size_t CLI_HTTP_MAX_ERROR_BODY = 1024 * 1024; + +// returns the path with the base url's path prefix prepended (if any) +static std::string join_path(const common_http_url & parts, const std::string & path) { + if (parts.path.empty() || parts.path == "/") { + return path; + } + std::string prefix = parts.path; + if (prefix.back() == '/') { + prefix.pop_back(); + } + return prefix + path; +} + +json cli_client::get(const std::string & path) { + auto [cli, parts] = common_http_client(server_base); + cli.set_read_timeout(CLI_HTTP_READ_TIMEOUT_SEC, 0); + auto res = cli.Get(join_path(parts, path)); + if (!res) { + throw std::runtime_error("failed to connect to " + server_base + ": " + httplib::to_string(res.error())); + } + if (res->status < 200 || res->status >= 300) { + throw std::runtime_error("GET " + path + " failed with status " + std::to_string(res->status) + ": " + res->body); + } + json result = json::parse(res->body, nullptr, false); + if (result.is_discarded()) { + throw std::runtime_error("GET " + path + " returned invalid JSON"); + } + return result; +} + +json cli_client::post(const std::string & path, const json & body) { + auto [cli, parts] = common_http_client(server_base); + cli.set_read_timeout(CLI_HTTP_READ_TIMEOUT_SEC, 0); + auto res = cli.Post(join_path(parts, path), body.dump(), "application/json"); + if (!res) { + throw std::runtime_error("failed to connect to " + server_base + ": " + httplib::to_string(res.error())); + } + if (res->status < 200 || res->status >= 300) { + throw std::runtime_error("POST " + path + " failed with status " + std::to_string(res->status) + ": " + res->body); + } + json result = json::parse(res->body, nullptr, false); + if (result.is_discarded()) { + throw std::runtime_error("POST " + path + " returned invalid JSON"); + } + return result; +} + +json cli_client::post_sse(const std::string & path, + const json & body, + const std::function & should_stop, + const std::function & on_data) { + auto [cli, parts] = common_http_client(server_base); + cli.set_read_timeout(CLI_HTTP_READ_TIMEOUT_SEC, 0); + + std::string pending; // buffer for incomplete SSE lines + std::string raw_body; // accumulated body, used only for error reporting + + auto receiver = [&](const char * data, size_t len) -> bool { + if (should_stop()) { + return false; // aborts the request + } + if (raw_body.size() < CLI_HTTP_MAX_ERROR_BODY) { + raw_body.append(data, std::min(len, CLI_HTTP_MAX_ERROR_BODY - raw_body.size())); + } + pending.append(data, len); + size_t pos; + while ((pos = pending.find('\n')) != std::string::npos) { + std::string line = pending.substr(0, pos); + pending.erase(0, pos + 1); + if (!line.empty() && line.back() == '\r') { + line.pop_back(); + } + if (line.rfind("data: ", 0) != 0) { + continue; + } + std::string payload = line.substr(6); + if (payload == "[DONE]") { + continue; + } + json event = json::parse(payload, nullptr, false); + if (!event.is_discarded()) { + on_data(event); + } + } + return true; + }; + + httplib::Headers headers = {{"Accept", "text/event-stream"}}; + auto res = cli.Post(join_path(parts, path), headers, body.dump(), "application/json", receiver); + + if (!res) { + if (res.error() == httplib::Error::Canceled && should_stop()) { + return json(); // cancelled by the user + } + return json {{"error", {{"message", "failed to connect to " + server_base + ": " + httplib::to_string(res.error())}}}}; + } + if (res->status < 200 || res->status >= 300) { + json error_body = json::parse(raw_body, nullptr, false); + if (!error_body.is_discarded() && error_body.contains("error")) { + return error_body; + } + return json {{"error", {{"message", "request failed with status " + std::to_string(res->status)}}}}; + } + return json(); +} + +bool cli_client::wait_health(const std::function & is_aborted) { + int connect_attempts = 0; + while (!is_aborted()) { + auto [cli, parts] = common_http_client(server_base); + cli.set_connection_timeout(1, 0); + auto res = cli.Get(join_path(parts, "/health")); + if (res) { + if (res->status == 200) { + return true; + } + // any other status means the server is up but not ready yet + // (e.g. 503 while the model is still loading) + } else if (++connect_attempts >= 10) { + last_error = "failed to connect to " + server_base + ": " + httplib::to_string(res.error()); + return false; + } + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + } + last_error = "aborted while waiting for the server to become ready"; + return false; +} diff --git a/tools/cli/cli-client.h b/tools/cli/cli-client.h new file mode 100644 index 000000000000..463deebf08f1 --- /dev/null +++ b/tools/cli/cli-client.h @@ -0,0 +1,52 @@ +#pragma once + +#include "ggml.h" + +#define JSON_ASSERT GGML_ASSERT +#include + +#include +#include + +using json = nlohmann::ordered_json; + +// openai-like client for CLI +struct cli_client { + std::string server_base; // base url, for example "http://127.0.0.1:8080" + std::string last_error; // set when wait_health() fails + + // simple GET request, returns the response json + // throws std::runtime_error on transport error or non-2xx status + json get(const std::string & path); + + // simple POST request, returns the response json + // throws std::runtime_error on transport error or non-2xx status + json post(const std::string & path, const json & body); + + // POST request with an SSE streaming response; on_data is invoked once + // per "data:" event; the function returns after the stream is finished: + // a null json on graceful exit (incl. cancellation via should_stop), + // the error response json otherwise + json post_sse(const std::string & path, + const json & body, + const std::function & should_stop, + const std::function & on_data); + + // poll /health until the server is ready to accept requests + // returns false if is_aborted returned true or the server is unreachable + bool wait_health(const std::function & is_aborted); + + // + // higher-level wrappers + // + + json create_chat_completion(const json & request, + const std::function & should_stop, + const std::function & on_data) { + return post_sse("/v1/chat/completions", request, should_stop, on_data); + } + + json get_props() { + return get("/props"); + } +}; diff --git a/tools/cli/cli-context.cpp b/tools/cli/cli-context.cpp new file mode 100644 index 000000000000..f5af5ef3b846 --- /dev/null +++ b/tools/cli/cli-context.cpp @@ -0,0 +1,547 @@ +#include "cli-context.h" +#include "cli-view.h" + +#include "arg.h" +#include "base64.hpp" +#include "log.h" +#include "console.h" + +#include +#include +#include +#include +#include + +std::atomic g_cli_interrupted = false; + +static bool should_stop() { + return g_cli_interrupted.load(); +} + +static constexpr size_t FILE_GLOB_MAX_RESULTS = 100; + +const char * LLAMA_ASCII_LOGO = R"( +▄▄ ▄▄ +██ ██ +██ ██ ▀▀█▄ ███▄███▄ ▀▀█▄ ▄████ ████▄ ████▄ +██ ██ ▄█▀██ ██ ██ ██ ▄█▀██ ██ ██ ██ ██ ██ +██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀ + ██ ██ + ▀▀ ▀▀ +)"; + +// number of values an arg consumes on the command line +static int arg_num_values(const common_arg & opt) { + if (opt.value_hint_2 != nullptr) { + return 2; + } + if (opt.value_hint != nullptr) { + return 1; + } + return 0; +} + +// keep only the args that llama-server understands, so that the remainder +// of the command line can be forwarded to the spawned server child +static std::vector filter_server_args(int argc, char ** argv) { + std::map cli_n_values; // arg -> number of values + std::set server_args; + + common_params dummy_cli; + auto ctx_cli = common_params_parser_init(dummy_cli, LLAMA_EXAMPLE_CLI); + for (const auto & opt : ctx_cli.options) { + for (const char * a : opt.args) { + cli_n_values[a] = arg_num_values(opt); + } + for (const char * a : opt.args_neg) { + cli_n_values[a] = 0; + } + } + + common_params dummy_server; + auto ctx_server = common_params_parser_init(dummy_server, LLAMA_EXAMPLE_SERVER); + for (const auto & opt : ctx_server.options) { + for (const char * a : opt.args) { + server_args.insert(a); + } + for (const char * a : opt.args_neg) { + server_args.insert(a); + } + } + + std::vector result; + for (int i = 1; i < argc; i++) { + const std::string arg = argv[i]; + auto it = cli_n_values.find(arg); + if (it == cli_n_values.end()) { + // not a known arg (should not happen when parsing succeeded) + continue; + } + const bool forward = server_args.count(arg) > 0; + if (forward) { + result.push_back(arg); + } + for (int j = 0; j < it->second && i + 1 < argc; j++) { + i++; + if (forward) { + result.push_back(argv[i]); + } + } + } + return result; +} + +static std::string format_error_message(const json & err) { + if (err.contains("error") && err.at("error").is_object()) { + const auto & e = err.at("error"); + if (e.contains("message") && e.at("message").is_string()) { + return e.at("message").get(); + } + } + return err.dump(); +} + +static std::string media_type_from_ext(const std::string & fname) { + std::string ext = std::filesystem::path(fname).extension().string(); + std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); + if (ext == ".wav" || ext == ".mp3") { + return "audio"; + } + if (ext == ".mp4" || ext == ".avi" || ext == ".mkv" || ext == ".mov" || ext == ".webm") { + return "video"; + } + return "image"; +} + +bool cli_context::init(int argc, char ** argv) { + std::optional spinner; + + if (!params.server_base.empty()) { + std::string base = params.server_base; + while (!base.empty() && base.back() == '/') { + base.pop_back(); + } + client.server_base = base; + + spinner.emplace("Connecting to server at " + base); + } else { + if (params.model.path.empty() && params.model.url.empty() && + params.model.hf_repo.empty() && params.model.docker_repo.empty()) { + view::show_error( + "no model specified", + "use -m or -hf to run a local model,\n" + "or --server-base to connect to a running llama-server" + ); + return false; + } + + spinner.emplace("Loading model..."); + + server.emplace(); + if (!server->start(filter_server_args(argc, argv))) { + view::show_error("server start failed"); + return false; + } + if (!server->wait_ready(should_stop)) { + if (!should_stop()) { + view::show_error("the server exited before becoming ready"); + } + return false; + } + client.server_base = server->address(); + } + + // for --server-base this is the main availability check; for a spawned + // server it is a cheap sanity check on top of the ready signal + auto is_aborted = [this]() { + return should_stop() || (server && !server->alive()); + }; + bool healthy = false; + try { + healthy = client.wait_health(is_aborted); + } catch (const std::exception & e) { + client.last_error = e.what(); + } + if (!healthy) { + if (!should_stop()) { + view::show_error(client.last_error); + } + return false; + } + + fetch_server_props(); + + return true; +} + +void cli_context::fetch_server_props() { + try { + json props = client.get_props(); + model_name = props.value("model_alias", ""); + if (model_name.empty()) { + const std::string path = props.value("model_path", ""); + if (!path.empty()) { + model_name = std::filesystem::path(path).filename().string(); + } + } + build_info = props.value("build_info", ""); + if (props.contains("modalities") && props.at("modalities").is_object()) { + const auto & modalities = props.at("modalities"); + has_vision = modalities.value("vision", false); + has_audio = modalities.value("audio", false); + has_video = modalities.value("video", false); + } + } catch (const std::exception & e) { + // /props can be disabled on remote servers; not fatal + LOG_DBG("failed to fetch /props: %s\n", e.what()); + } +} + +void cli_context::add_system_prompt() { + if (!params.system_prompt.empty()) { + messages.push_back({ + {"role", "system"}, + {"content", params.system_prompt} + }); + } +} + +void cli_context::push_user_message(const std::string & text) { + json content; + if (pending_media.empty()) { + content = text; + } else { + // multimodal message: media parts first, then the text + content = pending_media; + content.push_back({ + {"type", "text"}, + {"text", text} + }); + pending_media = json::array(); + } + messages.push_back({ + {"role", "user"}, + {"content", content} + }); +} + +bool cli_context::stage_media_file(const std::string & fname, const std::string & type) { + std::ifstream file(fname, std::ios::binary); + if (!file) { + return false; + } + std::string data((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + std::string encoded = base64::encode(data); + + if (type == "audio") { + std::string ext = std::filesystem::path(fname).extension().string(); + std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); + pending_media.push_back({ + {"type", "input_audio"}, + {"input_audio", { + {"data", encoded}, + {"format", ext == ".mp3" ? "mp3" : "wav"} + }} + }); + } else if (type == "video") { + pending_media.push_back({ + {"type", "input_video"}, + {"input_video", { + {"data", encoded} + }} + }); + } else { + // the server detects the actual image type from the data + pending_media.push_back({ + {"type", "image_url"}, + {"image_url", { + {"url", "data:image/unknown;base64," + encoded} + }} + }); + } + return true; +} + +bool cli_context::generate_completion(std::string & assistant_content, cli_timings & timings) { + json body = { + {"messages", messages}, + {"stream", true}, + // in order to get timings even when we cancel mid-way + {"timings_per_token", true}, + }; + + bool stream_error = false; + + view::assistant_turn a; + + json err = client.create_chat_completion(body, should_stop, [&](const json & chunk) { + if (chunk.contains("error")) { + stream_error = true; + view::show_error(format_error_message(chunk)); + return; + } + if (chunk.contains("timings")) { + const auto & t = chunk.at("timings"); + timings.prompt_per_second = t.value("prompt_per_second", 0.0); + timings.predicted_per_second = t.value("predicted_per_second", 0.0); + } + if (!chunk.contains("choices") || !chunk.at("choices").is_array() || chunk.at("choices").empty()) { + return; + } + const auto & choice = chunk.at("choices").at(0); + if (!choice.contains("delta")) { + return; + } + const auto & delta = choice.at("delta"); + if (delta.contains("reasoning_content") && delta.at("reasoning_content").is_string()) { + const std::string text = delta.at("reasoning_content").get(); + if (!text.empty()) { + a.push(view::ASSISTANT_DISPLAY_MODE_REASONING, text); + } + } + if (delta.contains("content") && delta.at("content").is_string()) { + const std::string text = delta.at("content").get(); + if (!text.empty()) { + assistant_content += text; + a.push(view::ASSISTANT_DISPLAY_MODE_CONTENT, text); + } + } + }); + + g_cli_interrupted.store(false); + + if (!err.is_null()) { + view::show_error(format_error_message(err)); + return false; + } + return !stream_error; +} + +int cli_context::run() { + add_system_prompt(); + + std::string modalities = "text"; + if (has_vision) { + modalities += ", vision"; + } + if (has_audio) { + modalities += ", audio"; + } + if (has_video) { + modalities += ", video"; + } + + std::vector banner; + banner.push_back("\n"); + banner.push_back(LLAMA_ASCII_LOGO); + banner.push_back("\n"); + banner.push_back("build : " + build_info); + banner.push_back("model : " + model_name); + banner.push_back("modalities : " + modalities); + if (!params.system_prompt.empty()) { + console::log("using custom system prompt\n"); + } + console::log("\n"); + console::log("available commands:\n"); + console::log(" /exit or Ctrl+C stop or exit\n"); + console::log(" /regen regenerate the last response\n"); + console::log(" /clear clear the chat history\n"); + console::log(" /read add a text file\n"); + console::log(" /glob add text files using globbing pattern\n"); + if (has_vision) { + console::log(" /image add an image file\n"); + } + if (has_audio) { + console::log(" /audio add an audio file\n"); + } + if (has_video) { + console::log(" /video add a video file\n"); + } + console::log("\n"); + + view::show_banner(banner); + + // interactive loop + std::string cur_msg; + + auto add_text_file = [&](const std::string & fname) -> bool { + std::ifstream file(fname, std::ios::binary); + if (!file) { + view::show_error(string_format("file does not exist or cannot be opened: '%s'", fname.c_str())); + return false; + } + std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + cur_msg += "--- File: "; + cur_msg += fname; + cur_msg += " ---\n"; + cur_msg += content; + view::show_message(string_format("Loaded text from '%s'", fname.c_str())); + return true; + }; + + while (true) { + std::string buffer; + { + view::user_turn user_turn; + + if (params.prompt.empty()) { + buffer = user_turn.read_input(params.multiline_input); + } else { + // process input prompt from args + for (auto & fname : params.image) { + if (!stage_media_file(fname, media_type_from_ext(fname))) { + view::show_error(string_format("file does not exist or cannot be opened: '%s'", fname.c_str())); + break; + } + view::show_message(string_format("Loaded media from '%s'", fname.c_str())); + } + buffer = params.prompt; + user_turn.echo(buffer); + params.prompt.clear(); // only use it once + } + } + + if (should_stop()) { + g_cli_interrupted.store(false); + break; + } + + // remove trailing newline + if (!buffer.empty() && buffer.back() == '\n') { + buffer.pop_back(); + } + + // skip empty messages + if (buffer.empty()) { + continue; + } + + bool add_user_msg = true; + + // process commands + if (string_starts_with(buffer, "/exit")) { + break; + } else if (string_starts_with(buffer, "/regen")) { + if (messages.size() >= 2) { + size_t last_idx = messages.size() - 1; + messages.erase(last_idx); + add_user_msg = false; + } else { + view::show_error("No message to regenerate."); + continue; + } + } else if (string_starts_with(buffer, "/clear")) { + messages.clear(); + add_system_prompt(); + + pending_media = json::array(); + view::show_message("Chat history cleared."); + continue; + } else if ( + (string_starts_with(buffer, "/image ") && has_vision) || + (string_starts_with(buffer, "/audio ") && has_audio) || + (string_starts_with(buffer, "/video ") && has_video)) { + std::string type = buffer.substr(1, 5); + // just in case (bad copy-paste for example), we strip all trailing/leading spaces + std::string fname = string_strip(buffer.substr(7)); + if (!stage_media_file(fname, type)) { + view::show_error(string_format("file does not exist or cannot be opened: '%s'", fname.c_str())); + continue; + } + view::show_message(string_format("Loaded media from '%s'", fname.c_str())); + continue; + } else if (string_starts_with(buffer, "/read ")) { + std::string fname = string_strip(buffer.substr(6)); + add_text_file(fname); + continue; + } else if (string_starts_with(buffer, "/glob ")) { + std::error_code ec; + size_t count = 0; + auto curdir = std::filesystem::current_path(); + std::string pattern = string_strip(buffer.substr(6)); + std::filesystem::path rel_path; + + auto startglob = pattern.find_first_of("![*?"); + if (startglob != std::string::npos && startglob != 0) { + auto endpath = pattern.substr(0, startglob).find_last_of('/'); + if (endpath != std::string::npos) { + std::string rel_pattern = pattern.substr(0, endpath); +#if !defined(_WIN32) + if (string_starts_with(rel_pattern, '~')) { + const char * home = std::getenv("HOME"); + if (home && home[0]) { + rel_pattern = home + rel_pattern.substr(1); + } + } +#endif + rel_path = rel_pattern; + pattern.erase(0, endpath + 1); + curdir /= rel_path; + } + } + + for (const auto & entry : std::filesystem::recursive_directory_iterator(curdir, + std::filesystem::directory_options::skip_permission_denied, ec)) { + if (!entry.is_regular_file()) { + continue; + } + + std::string rel = std::filesystem::relative(entry.path(), curdir, ec).string(); + if (ec) { + ec.clear(); + continue; + } + std::replace(rel.begin(), rel.end(), '\\', '/'); + + if (!glob_match(pattern, rel)) { + continue; + } + + if (!add_text_file((rel_path / rel).string())) { + continue; + } + + if (++count >= FILE_GLOB_MAX_RESULTS) { + view::show_error(string_format("Maximum number of globbed files allowed (%zu) reached.", FILE_GLOB_MAX_RESULTS)); + break; + } + } + continue; + } else { + // not a command + cur_msg += buffer; + } + + // generate response + if (add_user_msg) { + push_user_message(cur_msg); + cur_msg.clear(); + } + cli_timings timings; + std::string assistant_content; + generate_completion(assistant_content, timings); + messages.push_back({ + {"role", "assistant"}, + {"content", assistant_content} + }); + + if (params.show_timings) { + // TODO + } + + if (params.single_turn) { + break; + } + } + + view::show_message("Exiting..."); + + return 0; +} + +void cli_context::shutdown() { + if (server) { + server->stop(); + server.reset(); + } +} diff --git a/tools/cli/cli-context.h b/tools/cli/cli-context.h new file mode 100644 index 000000000000..ef55b2c9b470 --- /dev/null +++ b/tools/cli/cli-context.h @@ -0,0 +1,82 @@ +// controller for llama-cli (the "controller" in MVC) +// +// owns the chat state, drives the view and talks to llama-server through +// cli_client; when no --server-base is given it also manages a local +// llama-server child process via cli_server + +#pragma once + +#include "common.h" + +#include "cli-client.h" +#include "cli-server.h" + +#include +#include +#include + +struct cli_timings { + double prompt_per_second = 0.0; + double predicted_per_second = 0.0; +}; + +struct cli_command_info { + std::string usage; // e.g. "/read " + std::string description; // e.g. "add a text file" +}; + +// properties of the connected server, shown on startup +struct cli_server_info { + std::string build_info; + std::string model_name; + std::string server_base; + bool is_local_server = false; // server is spawned and managed by llama-cli + bool has_system_prompt = false; + bool has_vision = false; + bool has_audio = false; + bool has_video = false; + + std::vector commands; +}; + +// set by the SIGINT handler; cleared once the interrupt has been handled +extern std::atomic g_cli_interrupted; + +struct cli_context { + common_params params; + + cli_client client; // always initialized + std::optional server; // only set when no --server-base is given + + json messages = json::array(); + json pending_media = json::array(); // staged multimodal content parts + + // properties of the connected server + std::string model_name; + std::string build_info; + bool has_vision = false; + bool has_audio = false; + bool has_video = false; + + cli_context(const common_params & params) : params(params) {} + + // connect to --server-base or spawn a local llama-server child; + // argc/argv are needed to forward the server-relevant args to the child + bool init(int argc, char ** argv); + + // run the interactive chat loop, returns the process exit code + int run(); + + // stop the local server child (if any) + void shutdown(); + +private: + bool generate_completion(std::string & assistant_content, cli_timings & timings); + void fetch_server_props(); + void add_system_prompt(); + void push_user_message(const std::string & text); + + // read a file and stage it as a multimodal content part; type is one of + // "image", "audio", "video"; returns false if the file cannot be read + bool stage_media_file(const std::string & fname, const std::string & type); +}; diff --git a/tools/cli/cli-server.h b/tools/cli/cli-server.h new file mode 100644 index 000000000000..8e6e388ce38d --- /dev/null +++ b/tools/cli/cli-server.h @@ -0,0 +1,73 @@ +#pragma once + +#include + +#include "http.h" + +// spawn llama-server in a thread and interact with it via a random port +// note: in the future, we may have a server running as daemon and the CLI can connect to it automatically + +// llama_server will be available as a dynamic library symbol +int llama_server(int argc, char ** argv); + +struct cli_server { + std::thread th; + int port = -1; + + ~cli_server() { + stop(); + } + + void stop() { + if (th.joinable()) { + th.detach(); + } + } + + bool start(std::vector args) { + port = common_http_get_free_port(); + if (port <= 0) { + fprintf(stderr, "failed to get a free port\n"); + exit(1); + } + + th = std::thread([&, args_ = args]() { + auto args = args_; // copy to modify + args.push_back("--port"); + args.push_back(std::to_string(port)); + + // convert to char* array + std::vector argv; + for (auto & arg : args) { + argv.push_back(arg.data()); + } + argv.push_back(nullptr); + + int res = llama_server(static_cast(args.size()), argv.data()); + if (res != 0) { + fprintf(stderr, "llama_server exited with code %d\n", res); + } + }); + + return true; + } + + std::string address() const { + return "http://127.0.0.1:" + std::to_string(port); + } + + bool wait_ready(std::function should_stop) { + // while (true) { + // if (should_stop()) { + // break; + // } + // std::this_thread::sleep_for(std::chrono::milliseconds(5000)); + // } + std::this_thread::sleep_for(std::chrono::milliseconds(5000)); + return true; + } + + bool alive() const { + return th.joinable(); + } +}; diff --git a/tools/cli/cli-view.h b/tools/cli/cli-view.h new file mode 100644 index 000000000000..6168f6ade3d5 --- /dev/null +++ b/tools/cli/cli-view.h @@ -0,0 +1,103 @@ +#pragma once + +#include "common.h" +#include "console.h" + +// note: make this view implementation generic, so that we can move to TUI in the future if we want to +namespace view { + using completion_callback = std::function>(std::string_view, size_t)>; + + static void set_completion_callback(completion_callback cb) { + console::set_completion_callback(std::move(cb)); + } + + static void init(const common_params & params) { + // TODO: avoid using atexit() here by making `console` a singleton + console::init(params.simple_io, params.use_color); + atexit([]() { console::cleanup(); }); + } + + struct spinner { + spinner(const std::string & message) { + console::log("%s\n", message.c_str()); + console::spinner::start(); + } + ~spinner() { + console::spinner::stop(); + } + }; + + struct user_turn { + user_turn() { + console::set_display(DISPLAY_TYPE_USER_INPUT); + } + ~user_turn() { + console::set_display(DISPLAY_TYPE_RESET); + } + void echo(const std::string & buffer) { + if (buffer.size() > 500) { + console::log("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str()); + } else { + console::log("\n> %s\n", buffer.c_str()); + } + } + std::string read_input(bool multiline_input) { + console::log("\n> "); + std::string buffer; + std::string line; + bool another_line = true; + do { + another_line = console::readline(line, multiline_input); + buffer += line; + } while (another_line); + return buffer; + } + }; + + enum assistant_display_mode { + ASSISTANT_DISPLAY_MODE_REASONING, + ASSISTANT_DISPLAY_MODE_CONTENT, + }; + struct assistant_turn { + assistant_display_mode mode = ASSISTANT_DISPLAY_MODE_CONTENT; + assistant_turn() { + console::set_display(DISPLAY_TYPE_RESET); + } + ~assistant_turn() { + console::set_display(DISPLAY_TYPE_RESET); + } + void push(assistant_display_mode m, const std::string & buffer) { + if (m != mode) { + switch (m) { + case ASSISTANT_DISPLAY_MODE_CONTENT: + console::set_display(DISPLAY_TYPE_RESET); + break; + case ASSISTANT_DISPLAY_MODE_REASONING: + console::set_display(DISPLAY_TYPE_REASONING); + break; + } + } + mode = m; + console::log("%s", buffer.c_str()); + console::flush(); + } + }; + + static void show_error(const std::string & title, const std::string & message = "") { + console::spinner::stop(); + console::error("Error: %s\n", title.c_str()); + if (!message.empty()) { + console::log("%s\n", message.c_str()); + } + } + + static void show_message(const std::string & message) { + console::log("%s\n", message.c_str()); + } + + static void show_banner(const std::vector & lines) { + for (const auto & line : lines) { + console::log("%s\n", line.c_str()); + } + } +}; diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 8b7b58693fc9..f16f18294e97 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -1,20 +1,14 @@ -#include "chat.h" -#include "common.h" #include "arg.h" -#include "console.h" -#include "fit.h" -// #include "log.h" +#include "common.h" +#include "log.h" -#include "server-common.h" -#include "server-context.h" -#include "server-task.h" +#include "cli-context.h" +#include "cli-view.h" #include -#include #include #include -#include -#include +#include #include #if defined(_WIN32) @@ -25,222 +19,19 @@ #include #endif -const char * LLAMA_ASCII_LOGO = R"( -▄▄ ▄▄ -██ ██ -██ ██ ▀▀█▄ ███▄███▄ ▀▀█▄ ▄████ ████▄ ████▄ -██ ██ ▄█▀██ ██ ██ ██ ▄█▀██ ██ ██ ██ ██ ██ -██ ██ ▀█▄██ ██ ██ ██ ▀█▄██ ██ ▀████ ████▀ ████▀ - ██ ██ - ▀▀ ▀▀ -)"; - -static std::atomic g_is_interrupted = false; -static bool should_stop() { - return g_is_interrupted.load(); -} - #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) static void signal_handler(int) { - if (g_is_interrupted.load()) { + if (g_cli_interrupted.load()) { // second Ctrl+C - exit immediately // make sure to clear colors before exiting (not using LOG or console.cpp here to avoid deadlock) fprintf(stdout, "\033[0m\n"); fflush(stdout); std::exit(130); } - g_is_interrupted.store(true); + g_cli_interrupted.store(true); } #endif -struct cli_context { - server_context ctx_server; - json messages = json::array(); - std::vector input_files; - task_params defaults; - bool verbose_prompt; - - // thread for showing "loading" animation - std::atomic loading_show; - - cli_context(const common_params & params) { - defaults.sampling = params.sampling; - defaults.speculative = params.speculative; - defaults.n_keep = params.n_keep; - defaults.n_predict = params.n_predict; - defaults.antiprompt = params.antiprompt; - - defaults.stream = true; // make sure we always use streaming mode - defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way - // defaults.return_progress = true; // TODO: show progress - - verbose_prompt = params.verbose_prompt; - } - - std::string generate_completion(result_timings & out_timings) { - server_response_reader rd = ctx_server.get_response_reader(); - auto chat_params = format_chat(); - { - // TODO: reduce some copies here in the future - server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); - task.id = rd.get_new_id(); - task.index = 0; - task.params = defaults; // copy - task.cli_prompt = chat_params.prompt; // copy - task.cli_files = input_files; // copy - task.cli = true; - - // chat template settings - task.params.chat_parser_params = common_chat_parser_params(chat_params); - task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; - if (!chat_params.parser.empty()) { - task.params.chat_parser_params.parser.load(chat_params.parser); - } - - // Copy the preserved tokens into the sampling params - const llama_vocab * vocab = llama_model_get_vocab( - llama_get_model(ctx_server.get_llama_context())); - for (const auto & token : chat_params.preserved_tokens) { - auto ids = common_tokenize(vocab, token, false, true); - if (ids.size() == 1) { - task.params.sampling.preserved_tokens.insert(ids[0]); - } - } - - // reasoning budget sampler - if (!chat_params.thinking_end_tag.empty()) { - task.params.sampling.reasoning_budget_tokens = defaults.sampling.reasoning_budget_tokens; - task.params.sampling.generation_prompt = chat_params.generation_prompt; - - if (!chat_params.thinking_start_tag.empty()) { - task.params.sampling.reasoning_budget_start = - common_tokenize(vocab, chat_params.thinking_start_tag, false, true); - } - task.params.sampling.reasoning_budget_end = - common_tokenize(vocab, chat_params.thinking_end_tag, false, true); - task.params.sampling.reasoning_budget_forced = - common_tokenize(vocab, defaults.sampling.reasoning_budget_message + chat_params.thinking_end_tag, false, true); - } - - rd.post_task({std::move(task)}); - } - - if (verbose_prompt) { - console::set_display(DISPLAY_TYPE_PROMPT); - console::log("%s\n\n", chat_params.prompt.c_str()); - console::set_display(DISPLAY_TYPE_RESET); - } - - // wait for first result - console::spinner::start(); - server_task_result_ptr result = rd.next(should_stop); - - while (true) { - auto res_partial = dynamic_cast(result.get()); - if (res_partial && res_partial->is_begin) { - // this is the "send 200 status to client" signal in streaming mode - // skip, do not stop the spinner - result = rd.next(should_stop); - } else { - console::spinner::stop(); - break; - } - } - - std::string curr_content; - bool is_thinking = false; - - while (result) { - if (should_stop()) { - break; - } - if (result->is_error()) { - json err_data = result->to_json(); - if (err_data.contains("message")) { - console::error("Error: %s\n", err_data["message"].get().c_str()); - } else { - console::error("Error: %s\n", err_data.dump().c_str()); - } - return curr_content; - } - auto res_partial = dynamic_cast(result.get()); - if (res_partial) { - out_timings = std::move(res_partial->timings); - for (const auto & diff : res_partial->oaicompat_msg_diffs) { - if (!diff.content_delta.empty()) { - if (is_thinking) { - console::log("\n[End thinking]\n\n"); - console::set_display(DISPLAY_TYPE_RESET); - is_thinking = false; - } - curr_content += diff.content_delta; - console::log("%s", diff.content_delta.c_str()); - console::flush(); - } - if (!diff.reasoning_content_delta.empty()) { - console::set_display(DISPLAY_TYPE_REASONING); - if (!is_thinking) { - console::log("[Start thinking]\n"); - } - is_thinking = true; - console::log("%s", diff.reasoning_content_delta.c_str()); - console::flush(); - } - } - } - auto res_final = dynamic_cast(result.get()); - if (res_final) { - out_timings = std::move(res_final->timings); - break; - } - result = rd.next(should_stop); - } - g_is_interrupted.store(false); - // server_response_reader automatically cancels pending tasks upon destruction - return curr_content; - } - - // TODO: support remote files in the future (http, https, etc) - std::string load_input_file(const std::string & fname, bool is_media) { - std::ifstream file = fs_open_ifstream(fname, std::ios::binary); - if (!file) { - return ""; - } - if (is_media) { - raw_buffer buf; - buf.assign((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - input_files.push_back(std::move(buf)); - return get_media_marker(); - } else { - std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - return content; - } - } - - common_chat_params format_chat() { - auto meta = ctx_server.get_meta(); - auto & chat_params = meta.chat_params; - - auto caps = common_chat_templates_get_caps(chat_params.tmpls.get()); - - common_chat_templates_inputs inputs; - inputs.messages = common_chat_msgs_parse_oaicompat(messages); - inputs.tools = {}; // TODO - inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE; - inputs.json_schema = ""; // TODO - inputs.grammar = ""; // TODO - inputs.use_jinja = chat_params.use_jinja; - inputs.parallel_tool_calls = caps["supports_parallel_tool_calls"]; - inputs.add_generation_prompt = true; - inputs.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; - inputs.force_pure_content = chat_params.force_pure_content; - inputs.enable_thinking = chat_params.enable_thinking ? common_chat_templates_support_enable_thinking(chat_params.tmpls.get()) : false; - - // Apply chat template to the list of messages - return common_chat_templates_apply(chat_params.tmpls.get(), inputs); - } -}; - // TODO?: Make this reusable, enums, docs static const std::array cmds = { "/audio ", @@ -359,8 +150,6 @@ static std::vector> auto_completion_callback(std: return matches; } -static constexpr size_t FILE_GLOB_MAX_RESULTS = 100; - // satisfies -Wmissing-declarations int llama_cli(int argc, char ** argv); @@ -375,24 +164,7 @@ int llama_cli(int argc, char ** argv) { return 1; } - // TODO: maybe support it later? - if (params.conversation_mode == COMMON_CONVERSATION_MODE_DISABLED) { - console::error("--no-conversation is not supported by llama-cli\n"); - console::error("please use llama-completion instead\n"); - } - - // struct that contains llama context and inference - cli_context ctx_cli(params); - - llama_backend_init(); - llama_numa_init(params.numa); - - // TODO: avoid using atexit() here by making `console` a singleton - console::init(params.simple_io, params.use_color); - atexit([]() { console::cleanup(); }); - - console::set_display(DISPLAY_TYPE_RESET); - console::set_completion_callback(auto_completion_callback); + view::set_completion_callback(auto_completion_callback); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; @@ -408,273 +180,16 @@ int llama_cli(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - console::log("\nLoading model... "); // followed by loading animation - console::spinner::start(); - if (!ctx_cli.ctx_server.load_model(params)) { - console::spinner::stop(); - console::error("\nFailed to load the model\n"); - return 1; - } - - ctx_cli.defaults.sampling = params.sampling; - - console::spinner::stop(); - console::log("\n"); - - std::thread inference_thread([&ctx_cli]() { - ctx_cli.ctx_server.start_loop(); - }); - - auto inf = ctx_cli.ctx_server.get_meta(); - std::string modalities = "text"; - if (inf.has_inp_image) { - modalities += ", vision"; - } - if (inf.has_inp_audio) { - modalities += ", audio"; - } - - auto add_system_prompt = [&]() { - if (!params.system_prompt.empty()) { - ctx_cli.messages.push_back({ - {"role", "system"}, - {"content", params.system_prompt} - }); - } - }; - add_system_prompt(); - - console::log("\n"); - console::log("%s\n", LLAMA_ASCII_LOGO); - console::log("build : %s\n", inf.build_info.c_str()); - console::log("model : %s\n", inf.model_name.c_str()); - console::log("modalities : %s\n", modalities.c_str()); - if (!params.system_prompt.empty()) { - console::log("using custom system prompt\n"); - } - console::log("\n"); - console::log("available commands:\n"); - console::log(" /exit or Ctrl+C stop or exit\n"); - console::log(" /regen regenerate the last response\n"); - console::log(" /clear clear the chat history\n"); - console::log(" /read add a text file\n"); - console::log(" /glob add text files using globbing pattern\n"); - if (inf.has_inp_image) { - console::log(" /image add an image file\n"); - } - if (inf.has_inp_audio) { - console::log(" /audio add an audio file\n"); - } - if (inf.has_inp_video) { - console::log(" /video add a video file\n"); - } - console::log("\n"); - - // interactive loop - std::string cur_msg; - - auto add_text_file = [&](const std::string & fname) -> bool { - std::string marker = ctx_cli.load_input_file(fname, false); - if (marker.empty()) { - console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); - return false; - } - if (inf.fim_sep_token != LLAMA_TOKEN_NULL) { - cur_msg += common_token_to_piece(ctx_cli.ctx_server.get_llama_context(), inf.fim_sep_token, true); - cur_msg += fname; - cur_msg.push_back('\n'); - } else { - cur_msg += "--- File: "; - cur_msg += fname; - cur_msg += " ---\n"; - } - cur_msg += marker; - console::log("Loaded text from '%s'\n", fname.c_str()); - return true; - }; - - while (true) { - std::string buffer; - console::set_display(DISPLAY_TYPE_USER_INPUT); - if (params.prompt.empty()) { - console::log("\n> "); - std::string line; - bool another_line = true; - do { - another_line = console::readline(line, params.multiline_input); - buffer += line; - } while (another_line); - } else { - // process input prompt from args - for (auto & fname : params.image) { - std::string marker = ctx_cli.load_input_file(fname, true); - if (marker.empty()) { - console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); - break; - } - console::log("Loaded media from '%s'\n", fname.c_str()); - cur_msg += marker; - } - buffer = params.prompt; - if (buffer.size() > 500) { - console::log("\n> %s ... (truncated)\n", buffer.substr(0, 500).c_str()); - } else { - console::log("\n> %s\n", buffer.c_str()); - } - params.prompt.clear(); // only use it once - } - console::set_display(DISPLAY_TYPE_RESET); - console::log("\n"); - - if (should_stop()) { - g_is_interrupted.store(false); - break; - } - - // remove trailing newline - if (!buffer.empty() &&buffer.back() == '\n') { - buffer.pop_back(); - } - - // skip empty messages - if (buffer.empty()) { - continue; - } - - bool add_user_msg = true; - - // process commands - if (string_starts_with(buffer, "/exit")) { - break; - } else if (string_starts_with(buffer, "/regen")) { - if (ctx_cli.messages.size() >= 2) { - size_t last_idx = ctx_cli.messages.size() - 1; - ctx_cli.messages.erase(last_idx); - add_user_msg = false; - } else { - console::error("No message to regenerate.\n"); - continue; - } - } else if (string_starts_with(buffer, "/clear")) { - ctx_cli.messages.clear(); - add_system_prompt(); - - ctx_cli.input_files.clear(); - console::log("Chat history cleared.\n"); - continue; - } else if ( - (string_starts_with(buffer, "/image ") && inf.has_inp_image) || - (string_starts_with(buffer, "/audio ") && inf.has_inp_audio) || - (string_starts_with(buffer, "/video ") && inf.has_inp_video)) { - // just in case (bad copy-paste for example), we strip all trailing/leading spaces - std::string fname = string_strip(buffer.substr(7)); - std::string marker = ctx_cli.load_input_file(fname, true); - if (marker.empty()) { - console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); - continue; - } - cur_msg += marker; - console::log("Loaded media from '%s'\n", fname.c_str()); - continue; - } else if (string_starts_with(buffer, "/read ")) { - std::string fname = string_strip(buffer.substr(6)); - add_text_file(fname); - continue; - } else if (string_starts_with(buffer, "/glob ")) { - std::error_code ec; - size_t count = 0; - auto curdir = std::filesystem::current_path(); - std::string pattern = string_strip(buffer.substr(6)); - std::filesystem::path rel_path; - - auto startglob = pattern.find_first_of("![*?"); - if (startglob != std::string::npos && startglob != 0) { - auto endpath = pattern.substr(0, startglob).find_last_of('/'); - if (endpath != std::string::npos) { - std::string rel_pattern = pattern.substr(0, endpath); -#if !defined(_WIN32) - if (string_starts_with(rel_pattern, '~')) { - const char * home = std::getenv("HOME"); - if (home && home[0]) { - rel_pattern = home + rel_pattern.substr(1); - } - } -#endif - rel_path = rel_pattern; - pattern.erase(0, endpath + 1); - curdir /= rel_path; - } - } - - for (const auto & entry : std::filesystem::recursive_directory_iterator(curdir, - std::filesystem::directory_options::skip_permission_denied, ec)) { - if (!entry.is_regular_file()) { - continue; - } - - std::string rel = std::filesystem::relative(entry.path(), curdir, ec).string(); - if (ec) { - ec.clear(); - continue; - } - std::replace(rel.begin(), rel.end(), '\\', '/'); - - if (!glob_match(pattern, rel)) { - continue; - } - - if (!add_text_file((rel_path / rel).string())) { - continue; - } - - if (++count >= FILE_GLOB_MAX_RESULTS) { - console::error("Maximum number of globbed files allowed (%zu) reached.\n", FILE_GLOB_MAX_RESULTS); - break; - } - } - continue; - } else { - // not a command - cur_msg += buffer; - } - - // generate response - if (add_user_msg) { - ctx_cli.messages.push_back({ - {"role", "user"}, - {"content", cur_msg} - }); - cur_msg.clear(); - } - result_timings timings; - std::string assistant_content = ctx_cli.generate_completion(timings); - ctx_cli.messages.push_back({ - {"role", "assistant"}, - {"content", assistant_content} - }); - console::log("\n"); - - if (params.show_timings) { - console::set_display(DISPLAY_TYPE_INFO); - console::log("\n"); - console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second); - console::set_display(DISPLAY_TYPE_RESET); - } + cli_context ctx_cli(params); - if (params.single_turn) { - break; - } + if (!ctx_cli.init(argc, argv)) { + ctx_cli.shutdown(); + return 1; } - console::set_display(DISPLAY_TYPE_RESET); - - console::log("\nExiting...\n"); - ctx_cli.ctx_server.terminate(); - inference_thread.join(); + int ret = ctx_cli.run(); - // bump the log level to display timings - common_log_set_verbosity_thold(LOG_LEVEL_INFO); - common_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context()); + ctx_cli.shutdown(); - return 0; + return ret; } diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index a87e4e423ede..cf0bc845eae1 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -5,6 +5,7 @@ #include "build-info.h" #include "preset.h" #include "download.h" +#include "http.h" #include // TODO: remove this once we use HTTP client from download.h #include @@ -25,14 +26,7 @@ #include #include -#ifdef _WIN32 -#include -#include -#else -#include -#include -#include -#include +#ifndef _WIN32 extern char **environ; #endif @@ -704,66 +698,6 @@ std::optional server_models::get_meta(const std::string & nam return std::nullopt; } -static int get_free_port() { -#ifdef _WIN32 - WSADATA wsaData; - if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) { - return -1; - } - typedef SOCKET native_socket_t; -#define INVALID_SOCKET_VAL INVALID_SOCKET -#define CLOSE_SOCKET(s) closesocket(s) -#else - typedef int native_socket_t; -#define INVALID_SOCKET_VAL -1 -#define CLOSE_SOCKET(s) close(s) -#endif - - native_socket_t sock = socket(AF_INET, SOCK_STREAM, 0); - if (sock == INVALID_SOCKET_VAL) { -#ifdef _WIN32 - WSACleanup(); -#endif - return -1; - } - - struct sockaddr_in serv_addr; - std::memset(&serv_addr, 0, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); - serv_addr.sin_port = htons(0); - - if (bind(sock, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) != 0) { - CLOSE_SOCKET(sock); -#ifdef _WIN32 - WSACleanup(); -#endif - return -1; - } - -#ifdef _WIN32 - int namelen = sizeof(serv_addr); -#else - socklen_t namelen = sizeof(serv_addr); -#endif - if (getsockname(sock, (struct sockaddr*)&serv_addr, &namelen) != 0) { - CLOSE_SOCKET(sock); -#ifdef _WIN32 - WSACleanup(); -#endif - return -1; - } - - int port = ntohs(serv_addr.sin_port); - - CLOSE_SOCKET(sock); -#ifdef _WIN32 - WSACleanup(); -#endif - - return port; -} - // helper to convert vector to char ** // pointers are only valid as long as the original vector is valid static std::vector to_char_ptr_array(const std::vector & vec) { @@ -867,7 +801,7 @@ void server_models::load(const std::string & name, const load_options & opts) { // prepare new instance info instance_t inst; inst.meta = meta; - inst.meta.port = get_free_port(); + inst.meta.port = common_http_get_free_port(); inst.meta.status = SERVER_MODEL_STATUS_LOADING; inst.meta.loaded_info = json{}; inst.meta.last_used = ggml_time_ms(); From f7421eabe824ec09930a7081ae78c4f48dc1287c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 23 Jun 2026 13:28:14 +0200 Subject: [PATCH 2/9] wip --- tools/cli/CMakeLists.txt | 2 +- tools/cli/cli-context.cpp | 54 ++------------------------------------- tools/cli/cli-context.h | 2 +- tools/cli/cli-server.h | 20 ++++----------- tools/cli/cli.cpp | 2 +- tools/server/server.cpp | 13 +++++++--- 6 files changed, 20 insertions(+), 73 deletions(-) diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt index 2fa648bf0c89..8449cdbaffcd 100644 --- a/tools/cli/CMakeLists.txt +++ b/tools/cli/CMakeLists.txt @@ -8,7 +8,7 @@ add_library(${TARGET} cli.cpp set_target_properties(${TARGET} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) target_include_directories(${TARGET} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ../server) -target_link_libraries(${TARGET} PUBLIC server-context llama-common ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PUBLIC llama-server-impl llama-common ${CMAKE_THREAD_LIBS_INIT}) if(LLAMA_TOOLS_INSTALL) install(TARGETS ${TARGET} LIBRARY) diff --git a/tools/cli/cli-context.cpp b/tools/cli/cli-context.cpp index f5af5ef3b846..dfc2c9f6b9da 100644 --- a/tools/cli/cli-context.cpp +++ b/tools/cli/cli-context.cpp @@ -41,56 +41,6 @@ static int arg_num_values(const common_arg & opt) { return 0; } -// keep only the args that llama-server understands, so that the remainder -// of the command line can be forwarded to the spawned server child -static std::vector filter_server_args(int argc, char ** argv) { - std::map cli_n_values; // arg -> number of values - std::set server_args; - - common_params dummy_cli; - auto ctx_cli = common_params_parser_init(dummy_cli, LLAMA_EXAMPLE_CLI); - for (const auto & opt : ctx_cli.options) { - for (const char * a : opt.args) { - cli_n_values[a] = arg_num_values(opt); - } - for (const char * a : opt.args_neg) { - cli_n_values[a] = 0; - } - } - - common_params dummy_server; - auto ctx_server = common_params_parser_init(dummy_server, LLAMA_EXAMPLE_SERVER); - for (const auto & opt : ctx_server.options) { - for (const char * a : opt.args) { - server_args.insert(a); - } - for (const char * a : opt.args_neg) { - server_args.insert(a); - } - } - - std::vector result; - for (int i = 1; i < argc; i++) { - const std::string arg = argv[i]; - auto it = cli_n_values.find(arg); - if (it == cli_n_values.end()) { - // not a known arg (should not happen when parsing succeeded) - continue; - } - const bool forward = server_args.count(arg) > 0; - if (forward) { - result.push_back(arg); - } - for (int j = 0; j < it->second && i + 1 < argc; j++) { - i++; - if (forward) { - result.push_back(argv[i]); - } - } - } - return result; -} - static std::string format_error_message(const json & err) { if (err.contains("error") && err.at("error").is_object()) { const auto & e = err.at("error"); @@ -113,7 +63,7 @@ static std::string media_type_from_ext(const std::string & fname) { return "image"; } -bool cli_context::init(int argc, char ** argv) { +bool cli_context::init() { std::optional spinner; if (!params.server_base.empty()) { @@ -138,7 +88,7 @@ bool cli_context::init(int argc, char ** argv) { spinner.emplace("Loading model..."); server.emplace(); - if (!server->start(filter_server_args(argc, argv))) { + if (!server->start(params)) { view::show_error("server start failed"); return false; } diff --git a/tools/cli/cli-context.h b/tools/cli/cli-context.h index ef55b2c9b470..cbf6729d6d4f 100644 --- a/tools/cli/cli-context.h +++ b/tools/cli/cli-context.h @@ -62,7 +62,7 @@ struct cli_context { // connect to --server-base or spawn a local llama-server child; // argc/argv are needed to forward the server-relevant args to the child - bool init(int argc, char ** argv); + bool init(); // run the interactive chat loop, returns the process exit code int run(); diff --git a/tools/cli/cli-server.h b/tools/cli/cli-server.h index 8e6e388ce38d..41f860af866c 100644 --- a/tools/cli/cli-server.h +++ b/tools/cli/cli-server.h @@ -8,7 +8,7 @@ // note: in the future, we may have a server running as daemon and the CLI can connect to it automatically // llama_server will be available as a dynamic library symbol -int llama_server(int argc, char ** argv); +int llama_server(common_params & params, int argc, char ** argv); struct cli_server { std::thread th; @@ -24,26 +24,16 @@ struct cli_server { } } - bool start(std::vector args) { + bool start(common_params & params) { port = common_http_get_free_port(); if (port <= 0) { fprintf(stderr, "failed to get a free port\n"); exit(1); } - th = std::thread([&, args_ = args]() { - auto args = args_; // copy to modify - args.push_back("--port"); - args.push_back(std::to_string(port)); - - // convert to char* array - std::vector argv; - for (auto & arg : args) { - argv.push_back(arg.data()); - } - argv.push_back(nullptr); - - int res = llama_server(static_cast(args.size()), argv.data()); + th = std::thread([&]() { + // argc / argv are only used in router mode, we can skip them for now + int res = llama_server(params, 0, nullptr); if (res != 0) { fprintf(stderr, "llama_server exited with code %d\n", res); } diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index f16f18294e97..2778b98c0bb6 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -182,7 +182,7 @@ int llama_cli(int argc, char ** argv) { cli_context ctx_cli(params); - if (!ctx_cli.init(argc, argv)) { + if (!ctx_cli.init()) { ctx_cli.shutdown(); return 1; } diff --git a/tools/server/server.cpp b/tools/server/server.cpp index dd4b1c507c83..b5902458c813 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -21,6 +21,12 @@ #include #endif +// satisfies -Wmissing-declarations (used by llama command) +int llama_server(int argc, char ** argv); + +// to be used via CLI (argc / argv are used by router mode only) +int llama_server(common_params & params, int argc, char ** argv); + static std::function shutdown_handler; static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; @@ -71,9 +77,6 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t }; } -// satisfies -Wmissing-declarations -int llama_server(int argc, char ** argv); - int llama_server(int argc, char ** argv) { std::setlocale(LC_NUMERIC, "C"); @@ -89,6 +92,10 @@ int llama_server(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); + return llama_server(params, argc, argv); +} + +int llama_server(common_params & params, int argc, char ** argv) { // router server never loads a model and must not touch the GPU const bool is_router_server = params.model.path.empty() && params.model.hf_repo.empty(); From 19296c1735147a826d54eb9f53cab658b60449e7 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 23 Jun 2026 16:09:09 +0200 Subject: [PATCH 3/9] working --- tools/cli/cli-context.cpp | 52 ++++++++++++++++++++++----------------- tools/cli/cli-context.h | 26 +------------------- tools/cli/cli-server.h | 44 ++++++++++++++++++++++++--------- tools/cli/cli-view.h | 44 +++++++++++++++++++++++++-------- tools/server/main.cpp | 14 +++++++++++ tools/server/server.cpp | 50 +++++++++++++++++++++++-------------- 6 files changed, 142 insertions(+), 88 deletions(-) diff --git a/tools/cli/cli-context.cpp b/tools/cli/cli-context.cpp index dfc2c9f6b9da..cbfde0c0a36e 100644 --- a/tools/cli/cli-context.cpp +++ b/tools/cli/cli-context.cpp @@ -64,6 +64,8 @@ static std::string media_type_from_ext(const std::string & fname) { } bool cli_context::init() { + view::init(params); + std::optional spinner; if (!params.server_base.empty()) { @@ -85,7 +87,7 @@ bool cli_context::init() { return false; } - spinner.emplace("Loading model..."); + spinner.emplace("\n\nLoading model..."); server.emplace(); if (!server->start(params)) { @@ -281,35 +283,35 @@ int cli_context::run() { modalities += ", video"; } - std::vector banner; - banner.push_back("\n"); - banner.push_back(LLAMA_ASCII_LOGO); - banner.push_back("\n"); - banner.push_back("build : " + build_info); - banner.push_back("model : " + model_name); - banner.push_back("modalities : " + modalities); + std::string banner; + banner += "\n"; + banner += LLAMA_ASCII_LOGO; + banner += "\n"; + banner += "build : " + build_info + "\n"; + banner += "model : " + model_name + "\n"; + banner += "modalities : " + modalities + "\n"; if (!params.system_prompt.empty()) { - console::log("using custom system prompt\n"); + banner += "using custom system prompt\n"; } - console::log("\n"); - console::log("available commands:\n"); - console::log(" /exit or Ctrl+C stop or exit\n"); - console::log(" /regen regenerate the last response\n"); - console::log(" /clear clear the chat history\n"); - console::log(" /read add a text file\n"); - console::log(" /glob add text files using globbing pattern\n"); + banner += "\n"; + banner += "available commands:\n"; + banner += " /exit or Ctrl+C stop or exit\n"; + banner += " /regen regenerate the last response\n"; + banner += " /clear clear the chat history\n"; + banner += " /read add a text file\n"; + banner += " /glob add text files using globbing pattern\n"; if (has_vision) { - console::log(" /image add an image file\n"); + banner += " /image add an image file\n"; } if (has_audio) { - console::log(" /audio add an audio file\n"); + banner += " /audio add an audio file\n"; } if (has_video) { - console::log(" /video add a video file\n"); + banner += " /video add a video file\n"; } - console::log("\n"); + banner += "\n"; - view::show_banner(banner); + view::show_message(banner); // interactive loop std::string cur_msg; @@ -476,7 +478,11 @@ int cli_context::run() { }); if (params.show_timings) { - // TODO + view::show_info(string_format( + "\n[ Prompt: %.1f t/s | Generation: %.1f t/s ]", + timings.prompt_per_second, + timings.predicted_per_second + )); } if (params.single_turn) { @@ -484,7 +490,7 @@ int cli_context::run() { } } - view::show_message("Exiting..."); + view::show_message("\n\nExiting..."); return 0; } diff --git a/tools/cli/cli-context.h b/tools/cli/cli-context.h index cbf6729d6d4f..2c67586d638d 100644 --- a/tools/cli/cli-context.h +++ b/tools/cli/cli-context.h @@ -1,9 +1,3 @@ -// controller for llama-cli (the "controller" in MVC) -// -// owns the chat state, drives the view and talks to llama-server through -// cli_client; when no --server-base is given it also manages a local -// llama-server child process via cli_server - #pragma once #include "common.h" @@ -20,25 +14,6 @@ struct cli_timings { double predicted_per_second = 0.0; }; -struct cli_command_info { - std::string usage; // e.g. "/read " - std::string description; // e.g. "add a text file" -}; - -// properties of the connected server, shown on startup -struct cli_server_info { - std::string build_info; - std::string model_name; - std::string server_base; - bool is_local_server = false; // server is spawned and managed by llama-cli - bool has_system_prompt = false; - bool has_vision = false; - bool has_audio = false; - bool has_video = false; - - std::vector commands; -}; - // set by the SIGINT handler; cleared once the interrupt has been handled extern std::atomic g_cli_interrupted; @@ -52,6 +27,7 @@ struct cli_context { json pending_media = json::array(); // staged multimodal content parts // properties of the connected server + // will be populated by fetch_server_props() std::string model_name; std::string build_info; bool has_vision = false; diff --git a/tools/cli/cli-server.h b/tools/cli/cli-server.h index 41f860af866c..50447f255114 100644 --- a/tools/cli/cli-server.h +++ b/tools/cli/cli-server.h @@ -9,18 +9,22 @@ // llama_server will be available as a dynamic library symbol int llama_server(common_params & params, int argc, char ** argv); +void llama_server_terminate(); struct cli_server { std::thread th; int port = -1; + std::atomic is_alive = false; + std::atomic is_stopping = false; ~cli_server() { stop(); } void stop() { - if (th.joinable()) { - th.detach(); + if (alive() && !is_stopping.exchange(true)) { + llama_server_terminate(); + th.join(); } } @@ -31,12 +35,17 @@ struct cli_server { exit(1); } + is_alive.store(true, std::memory_order_release); + th = std::thread([&]() { + common_params server_params = params; // copy + server_params.port = port; // argc / argv are only used in router mode, we can skip them for now - int res = llama_server(params, 0, nullptr); + int res = llama_server(server_params, 0, nullptr); if (res != 0) { fprintf(stderr, "llama_server exited with code %d\n", res); } + is_alive.store(false, std::memory_order_release); }); return true; @@ -47,17 +56,30 @@ struct cli_server { } bool wait_ready(std::function should_stop) { - // while (true) { - // if (should_stop()) { - // break; - // } - // std::this_thread::sleep_for(std::chrono::milliseconds(5000)); - // } - std::this_thread::sleep_for(std::chrono::milliseconds(5000)); + if (!alive()) { + return false; + } + while (!should_stop()) { + auto [cli, parts] = common_http_client(address()); + cli.set_connection_timeout(1, 0); + auto res = cli.Get("/health"); + if (res) { + if (res->status == 200) { + return true; + } + // any other status means the server is up but not ready yet + // (e.g. 503 while the model is still loading) + } + if (!alive()) { + // in case server die permanently + return false; + } + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + } return true; } bool alive() const { - return th.joinable(); + return is_alive.load(std::memory_order_acquire); } }; diff --git a/tools/cli/cli-view.h b/tools/cli/cli-view.h index 6168f6ade3d5..a44a0ba240df 100644 --- a/tools/cli/cli-view.h +++ b/tools/cli/cli-view.h @@ -19,7 +19,9 @@ namespace view { struct spinner { spinner(const std::string & message) { - console::log("%s\n", message.c_str()); + if (!message.empty()) { + console::log("%s ", message.c_str()); + } console::spinner::start(); } ~spinner() { @@ -60,27 +62,49 @@ namespace view { }; struct assistant_turn { assistant_display_mode mode = ASSISTANT_DISPLAY_MODE_CONTENT; + bool trailing_newline = true; + bool is_inside_reasoning = false; assistant_turn() { console::set_display(DISPLAY_TYPE_RESET); } ~assistant_turn() { console::set_display(DISPLAY_TYPE_RESET); + add_newline_if_needed(); } void push(assistant_display_mode m, const std::string & buffer) { if (m != mode) { + add_newline_if_needed(); switch (m) { case ASSISTANT_DISPLAY_MODE_CONTENT: - console::set_display(DISPLAY_TYPE_RESET); - break; + { + if (is_inside_reasoning) { + console::log("[End thinking]\n\n"); + is_inside_reasoning = false; + } + console::set_display(DISPLAY_TYPE_RESET); + } break; case ASSISTANT_DISPLAY_MODE_REASONING: - console::set_display(DISPLAY_TYPE_REASONING); - break; + { + console::set_display(DISPLAY_TYPE_REASONING); + is_inside_reasoning = true; + console::log("\n[Start thinking]\n\n"); + } break; } } mode = m; + if (buffer.empty()) { + return; + } + trailing_newline = buffer.back() == '\n'; console::log("%s", buffer.c_str()); console::flush(); } + void add_newline_if_needed() { + if (!trailing_newline) { + console::log("\n"); + console::flush(); + } + } }; static void show_error(const std::string & title, const std::string & message = "") { @@ -95,9 +119,9 @@ namespace view { console::log("%s\n", message.c_str()); } - static void show_banner(const std::vector & lines) { - for (const auto & line : lines) { - console::log("%s\n", line.c_str()); - } + static void show_info(const std::string & message) { + console::set_display(DISPLAY_TYPE_INFO); + console::log("%s\n", message.c_str()); + console::set_display(DISPLAY_TYPE_RESET); } -}; +} diff --git a/tools/server/main.cpp b/tools/server/main.cpp index 7f17c56a8c29..b8d14e311133 100644 --- a/tools/server/main.cpp +++ b/tools/server/main.cpp @@ -3,3 +3,17 @@ int llama_server(int argc, char ** argv); int main(int argc, char ** argv) { return llama_server(argc, argv); } + +// satisfies -Wmissing-declarations +void server_signal_handler(int signal); + +void server_signal_handler(int signal) { + if (is_terminating.test_and_set()) { + // in case it hangs, we can force terminate the server by hitting Ctrl+C twice + // this is for better developer experience, we can remove when the server is stable enough + fprintf(stderr, "Received second interrupt, terminating immediately.\n"); + exit(1); + } + + shutdown_handler(signal); +} diff --git a/tools/server/server.cpp b/tools/server/server.cpp index a101df655d6b..3b55c5f4be22 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -21,12 +21,6 @@ #include #endif -// satisfies -Wmissing-declarations (used by llama command) -int llama_server(int argc, char ** argv); - -// to be used via CLI (argc / argv are used by router mode only) -int llama_server(common_params & params, int argc, char ** argv); - static std::function shutdown_handler; static std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; @@ -41,6 +35,19 @@ static inline void signal_handler(int signal) { shutdown_handler(signal); } +// satisfies -Wmissing-declarations (used by llama command) +int llama_server(int argc, char ** argv); + +// to be used via CLI (argc / argv are used by router mode only) +int llama_server(common_params & params, int argc, char ** argv); +void llama_server_terminate(); +void llama_server_terminate() { + if (shutdown_handler) { + shutdown_handler(0); + } +} + + // wrapper function that handles exceptions and logs errors // this is to make sure handler_t never throws exceptions; instead, it returns an error response static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) { @@ -96,8 +103,10 @@ int llama_server(int argc, char ** argv) { } int llama_server(common_params & params, int argc, char ** argv) { + bool is_run_by_cli = (argv == nullptr); + // note: router mode also accepts -hf remote-preset, so we need to check that first - if (!params.model.hf_repo.empty()) { + if (!is_run_by_cli && !params.model.hf_repo.empty()) { try { common_params_handle_models_params handle_params; handle_params.preset_only = true; @@ -279,8 +288,9 @@ int llama_server(common_params & params, int argc, char ** argv) { if (child.is_child() && child.get_mode() == SERVER_CHILD_MODE_DOWNLOAD) { return child.run_download(params); - } else if (!is_router_server) { + } else if (!is_router_server && !is_run_by_cli) { // single-model mode (NOT spawned by router) + // if this is invoked by CLI, model downloading should already handled common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, {}); } @@ -363,20 +373,22 @@ int llama_server(common_params & params, int argc, char ** argv) { }; } - // TODO: refactor in common/console + // register signal handler is not running by CLI + if (!is_run_by_cli) { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) - struct sigaction sigint_action; - sigint_action.sa_handler = signal_handler; - sigemptyset (&sigint_action.sa_mask); - sigint_action.sa_flags = 0; - sigaction(SIGINT, &sigint_action, NULL); - sigaction(SIGTERM, &sigint_action, NULL); + struct sigaction sigint_action; + sigint_action.sa_handler = signal_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); + sigaction(SIGTERM, &sigint_action, NULL); #elif defined (_WIN32) - auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { - return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; - }; - SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif + } if (is_router_server) { SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str()); From 85c58bbcd098bdf8e0e92c697591a2e2fbd262b7 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 23 Jun 2026 16:19:28 +0200 Subject: [PATCH 4/9] remote server ok --- common/arg.cpp | 12 +++- common/common.h | 2 +- tools/cli/cli-view.h | 129 ++++++++++++++++++++++++++++++++++++++++-- tools/cli/cli.cpp | 124 ---------------------------------------- tools/server/main.cpp | 14 ----- 5 files changed, 134 insertions(+), 147 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 276dbec8bac5..2a20d6ae4fde 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -603,9 +603,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // model is required (except for server) // TODO @ngxson : maybe show a list of available models in CLI in this case - if (params.model.path.empty() - && !params.usage - && !params.completion) { + bool can_skip_model = params.usage || params.completion || !params.server_base.empty(); + if (!can_skip_model && params.model.path.empty()) { throw std::invalid_argument("error: --model is required\n"); } } @@ -1119,6 +1118,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.completion = true; } )); + add_opt(common_arg( + {"--server-base"}, "URL", + string_format("connect to this server instead of starting a new one, example: 'http://localhost:8080' (default: none)"), + [](common_params & params, const std::string & value) { + params.server_base = value; + } + ).set_examples({LLAMA_EXAMPLE_CLI})); add_opt(common_arg( {"--verbose-prompt"}, string_format("print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false"), diff --git a/common/common.h b/common/common.h index 381c0306c3f9..203de5dcb56c 100644 --- a/common/common.h +++ b/common/common.h @@ -632,7 +632,7 @@ struct common_params { std::map default_template_kwargs; // CLI params - std::string server_base; + std::string server_base; // if set, connect to this server instead of starting a new one // UI configs bool ui = true; diff --git a/tools/cli/cli-view.h b/tools/cli/cli-view.h index a44a0ba240df..852ae7e73319 100644 --- a/tools/cli/cli-view.h +++ b/tools/cli/cli-view.h @@ -3,18 +3,137 @@ #include "common.h" #include "console.h" -// note: make this view implementation generic, so that we can move to TUI in the future if we want to -namespace view { - using completion_callback = std::function>(std::string_view, size_t)>; +#include +#include +#include +#include + +// TODO?: Make this reusable, enums, docs +static const std::array cmds = { + "/audio ", + "/clear", + "/exit", + "/glob ", + "/image ", + "/read ", + "/regen", + "/video ", +}; + +static std::vector> auto_completion_callback(std::string_view line, size_t cursor_byte_pos) { + std::vector> matches; + std::string cmd; + + if (line.length() > 1 && line.front() == '/' && !std::any_of(cmds.begin(), cmds.end(), [line](std::string_view prefix) { + return string_starts_with(line, prefix); + })) { + auto it = cmds.begin(); + + while ((it = std::find_if(it, cmds.end(), [line](std::string_view cmd_line) { + return string_starts_with(cmd_line, line); + })) != cmds.end()) { + matches.emplace_back(*it, it->length()); + ++it; + } + } else { + auto it = std::find_if(cmds.begin(), cmds.end(), [line](std::string_view prefix) { + return prefix.back() == ' ' && string_starts_with(line, prefix); + }); + + if (it != cmds.end()) { + cmd = *it; + } + } + + if (!cmd.empty() && cmd != "/glob " && line.length() >= cmd.length() && cursor_byte_pos >= cmd.length()) { + const std::string path_prefix = std::string(line.substr(cmd.length(), cursor_byte_pos - cmd.length())); + const std::string path_postfix = std::string(line.substr(cursor_byte_pos)); + auto cur_dir = std::filesystem::current_path(); + std::string cur_dir_str = cur_dir.string(); + std::string expanded_prefix = path_prefix; + +#if !defined(_WIN32) + if (string_starts_with(path_prefix, '~')) { + const char * home = std::getenv("HOME"); + if (home && home[0]) { + expanded_prefix = home + path_prefix.substr(1); + } + } + if (string_starts_with(expanded_prefix, '/')) { +#else + if (std::isalpha(expanded_prefix[0]) && expanded_prefix.find(':') == 1) { +#endif + cur_dir = std::filesystem::path(expanded_prefix).parent_path(); + cur_dir_str.clear(); + } else if (!path_prefix.empty()) { + cur_dir /= std::filesystem::path(path_prefix).parent_path(); + } + + std::error_code ec; + for (const auto & entry : std::filesystem::directory_iterator(cur_dir, ec)) { + if (ec) { + break; + } + if (!entry.exists(ec)) { + ec.clear(); + continue; + } + + const std::string path_full = entry.path().string(); + std::string path_entry = !cur_dir_str.empty() && string_starts_with(path_full, cur_dir_str) ? path_full.substr(cur_dir_str.length() + 1) : path_full; - static void set_completion_callback(completion_callback cb) { - console::set_completion_callback(std::move(cb)); + if (entry.is_directory(ec)) { + path_entry.push_back(std::filesystem::path::preferred_separator); + } + + if (expanded_prefix.empty() || string_starts_with(path_entry, expanded_prefix)) { + const std::string updated_line = cmd + path_entry; + matches.emplace_back(updated_line + path_postfix, updated_line.length()); + } + + if (ec) { + ec.clear(); + } + } + + if (matches.empty()) { + const std::string updated_line = cmd + path_prefix; + matches.emplace_back(updated_line + path_postfix, updated_line.length()); + } + + // Add the longest common prefix + if (!expanded_prefix.empty() && matches.size() > 1) { + const std::string_view match0(matches[0].first); + const std::string_view match1(matches[1].first); + auto it = std::mismatch(match0.begin(), match0.end(), match1.begin(), match1.end()); + size_t len = it.first - match0.begin(); + + for (size_t i = 2; i < matches.size(); ++i) { + const std::string_view matchi(matches[i].first); + auto cmp = std::mismatch(match0.begin(), match0.end(), matchi.begin(), matchi.end()); + len = std::min(len, static_cast(cmp.first - match0.begin())); + } + + const std::string updated_line = std::string(match0.substr(0, len)); + matches.emplace_back(updated_line + path_postfix, updated_line.length()); + } + + std::sort(matches.begin(), matches.end(), [](const auto & a, const auto & b) { + return a.first.compare(0, a.second, b.first, 0, b.second) < 0; + }); } + return matches; +} + +// note: make this view implementation generic, so that we can move to TUI in the future if we want to +namespace view { static void init(const common_params & params) { // TODO: avoid using atexit() here by making `console` a singleton console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); + + console::set_completion_callback(auto_completion_callback); } struct spinner { diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 2778b98c0bb6..e832f903e500 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -5,10 +5,6 @@ #include "cli-context.h" #include "cli-view.h" -#include -#include -#include -#include #include #if defined(_WIN32) @@ -32,124 +28,6 @@ static void signal_handler(int) { } #endif -// TODO?: Make this reusable, enums, docs -static const std::array cmds = { - "/audio ", - "/clear", - "/exit", - "/glob ", - "/image ", - "/read ", - "/regen", - "/video ", -}; - -static std::vector> auto_completion_callback(std::string_view line, size_t cursor_byte_pos) { - std::vector> matches; - std::string cmd; - - if (line.length() > 1 && line.front() == '/' && !std::any_of(cmds.begin(), cmds.end(), [line](std::string_view prefix) { - return string_starts_with(line, prefix); - })) { - auto it = cmds.begin(); - - while ((it = std::find_if(it, cmds.end(), [line](std::string_view cmd_line) { - return string_starts_with(cmd_line, line); - })) != cmds.end()) { - matches.emplace_back(*it, it->length()); - ++it; - } - } else { - auto it = std::find_if(cmds.begin(), cmds.end(), [line](std::string_view prefix) { - return prefix.back() == ' ' && string_starts_with(line, prefix); - }); - - if (it != cmds.end()) { - cmd = *it; - } - } - - if (!cmd.empty() && cmd != "/glob " && line.length() >= cmd.length() && cursor_byte_pos >= cmd.length()) { - const std::string path_prefix = std::string(line.substr(cmd.length(), cursor_byte_pos - cmd.length())); - const std::string path_postfix = std::string(line.substr(cursor_byte_pos)); - auto cur_dir = std::filesystem::current_path(); - std::string cur_dir_str = cur_dir.string(); - std::string expanded_prefix = path_prefix; - -#if !defined(_WIN32) - if (string_starts_with(path_prefix, '~')) { - const char * home = std::getenv("HOME"); - if (home && home[0]) { - expanded_prefix = home + path_prefix.substr(1); - } - } - if (string_starts_with(expanded_prefix, '/')) { -#else - if (std::isalpha(expanded_prefix[0]) && expanded_prefix.find(':') == 1) { -#endif - cur_dir = std::filesystem::path(expanded_prefix).parent_path(); - cur_dir_str.clear(); - } else if (!path_prefix.empty()) { - cur_dir /= std::filesystem::path(path_prefix).parent_path(); - } - - std::error_code ec; - for (const auto & entry : std::filesystem::directory_iterator(cur_dir, ec)) { - if (ec) { - break; - } - if (!entry.exists(ec)) { - ec.clear(); - continue; - } - - const std::string path_full = entry.path().string(); - std::string path_entry = !cur_dir_str.empty() && string_starts_with(path_full, cur_dir_str) ? path_full.substr(cur_dir_str.length() + 1) : path_full; - - if (entry.is_directory(ec)) { - path_entry.push_back(std::filesystem::path::preferred_separator); - } - - if (expanded_prefix.empty() || string_starts_with(path_entry, expanded_prefix)) { - const std::string updated_line = cmd + path_entry; - matches.emplace_back(updated_line + path_postfix, updated_line.length()); - } - - if (ec) { - ec.clear(); - } - } - - if (matches.empty()) { - const std::string updated_line = cmd + path_prefix; - matches.emplace_back(updated_line + path_postfix, updated_line.length()); - } - - // Add the longest common prefix - if (!expanded_prefix.empty() && matches.size() > 1) { - const std::string_view match0(matches[0].first); - const std::string_view match1(matches[1].first); - auto it = std::mismatch(match0.begin(), match0.end(), match1.begin(), match1.end()); - size_t len = it.first - match0.begin(); - - for (size_t i = 2; i < matches.size(); ++i) { - const std::string_view matchi(matches[i].first); - auto cmp = std::mismatch(match0.begin(), match0.end(), matchi.begin(), matchi.end()); - len = std::min(len, static_cast(cmp.first - match0.begin())); - } - - const std::string updated_line = std::string(match0.substr(0, len)); - matches.emplace_back(updated_line + path_postfix, updated_line.length()); - } - - std::sort(matches.begin(), matches.end(), [](const auto & a, const auto & b) { - return a.first.compare(0, a.second, b.first, 0, b.second) < 0; - }); - } - - return matches; -} - // satisfies -Wmissing-declarations int llama_cli(int argc, char ** argv); @@ -164,8 +42,6 @@ int llama_cli(int argc, char ** argv) { return 1; } - view::set_completion_callback(auto_completion_callback); - #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; sigint_action.sa_handler = signal_handler; diff --git a/tools/server/main.cpp b/tools/server/main.cpp index b8d14e311133..7f17c56a8c29 100644 --- a/tools/server/main.cpp +++ b/tools/server/main.cpp @@ -3,17 +3,3 @@ int llama_server(int argc, char ** argv); int main(int argc, char ** argv) { return llama_server(argc, argv); } - -// satisfies -Wmissing-declarations -void server_signal_handler(int signal); - -void server_signal_handler(int signal) { - if (is_terminating.test_and_set()) { - // in case it hangs, we can force terminate the server by hitting Ctrl+C twice - // this is for better developer experience, we can remove when the server is stable enough - fprintf(stderr, "Received second interrupt, terminating immediately.\n"); - exit(1); - } - - shutdown_handler(signal); -} From 1401fc3ca7d1e9965d6c8e5ff71b49ed156061b5 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 23 Jun 2026 16:39:59 +0200 Subject: [PATCH 5/9] cli support router mode Co-authored-by: Piotr Wilkin --- tools/cli/cli-client.cpp | 29 ++++++++++++++++++++--- tools/cli/cli-client.h | 4 ++++ tools/cli/cli-context.cpp | 50 ++++++++++++++++++++++++++++++++++++++- tools/cli/cli-context.h | 4 ++++ tools/cli/cli-view.h | 8 +++++-- 5 files changed, 89 insertions(+), 6 deletions(-) diff --git a/tools/cli/cli-client.cpp b/tools/cli/cli-client.cpp index d45affba931c..946282185f2e 100644 --- a/tools/cli/cli-client.cpp +++ b/tools/cli/cli-client.cpp @@ -28,7 +28,8 @@ static std::string join_path(const common_http_url & parts, const std::string & json cli_client::get(const std::string & path) { auto [cli, parts] = common_http_client(server_base); cli.set_read_timeout(CLI_HTTP_READ_TIMEOUT_SEC, 0); - auto res = cli.Get(join_path(parts, path)); + auto path_with_model = path + (model.empty() ? "" : ("?model=" + model)); + auto res = cli.Get(join_path(parts, path_with_model)); if (!res) { throw std::runtime_error("failed to connect to " + server_base + ": " + httplib::to_string(res.error())); } @@ -45,7 +46,11 @@ json cli_client::get(const std::string & path) { json cli_client::post(const std::string & path, const json & body) { auto [cli, parts] = common_http_client(server_base); cli.set_read_timeout(CLI_HTTP_READ_TIMEOUT_SEC, 0); - auto res = cli.Post(join_path(parts, path), body.dump(), "application/json"); + auto body_with_model = body; + if (!model.empty()) { + body_with_model["model"] = model; + } + auto res = cli.Post(join_path(parts, path), body_with_model.dump(), "application/json"); if (!res) { throw std::runtime_error("failed to connect to " + server_base + ": " + httplib::to_string(res.error())); } @@ -100,7 +105,11 @@ json cli_client::post_sse(const std::string & path, }; httplib::Headers headers = {{"Accept", "text/event-stream"}}; - auto res = cli.Post(join_path(parts, path), headers, body.dump(), "application/json", receiver); + auto body_with_model = body; + if (!model.empty()) { + body_with_model["model"] = model; + } + auto res = cli.Post(join_path(parts, path), headers, body_with_model.dump(), "application/json", receiver); if (!res) { if (res.error() == httplib::Error::Canceled && should_stop()) { @@ -139,3 +148,17 @@ bool cli_client::wait_health(const std::function & is_aborted) { last_error = "aborted while waiting for the server to become ready"; return false; } + +std::vector cli_client::list_models() { + json resp = get("/v1/models"); + if (!resp.contains("data") || !resp.at("data").is_array()) { + throw std::runtime_error("invalid response from /v1/models"); + } + std::vector models; + for (const auto & m : resp.at("data")) { + if (m.contains("id") && m.at("id").is_string()) { + models.push_back(m.at("id").get()); + } + } + return models; +} diff --git a/tools/cli/cli-client.h b/tools/cli/cli-client.h index 463deebf08f1..1bf9adaf5488 100644 --- a/tools/cli/cli-client.h +++ b/tools/cli/cli-client.h @@ -15,6 +15,8 @@ struct cli_client { std::string server_base; // base url, for example "http://127.0.0.1:8080" std::string last_error; // set when wait_health() fails + std::string model; // optional, set when the server has multiple models (router mode) + // simple GET request, returns the response json // throws std::runtime_error on transport error or non-2xx status json get(const std::string & path); @@ -49,4 +51,6 @@ struct cli_client { json get_props() { return get("/props"); } + + std::vector list_models(); }; diff --git a/tools/cli/cli-context.cpp b/tools/cli/cli-context.cpp index cbfde0c0a36e..9d906ccbaba1 100644 --- a/tools/cli/cli-context.cpp +++ b/tools/cli/cli-context.cpp @@ -68,7 +68,8 @@ bool cli_context::init() { std::optional spinner; - if (!params.server_base.empty()) { + bool use_external_server = !params.server_base.empty(); + if (use_external_server) { std::string base = params.server_base; while (!base.empty() && base.back() == '/') { base.pop_back(); @@ -121,6 +122,15 @@ bool cli_context::init() { return false; } + if (use_external_server) { + spinner.reset(); + if (!list_and_ask_models()) { + return false; + } + // restore the spinner for the next step + spinner.emplace("Waiting for server..."); + } + fetch_server_props(); return true; @@ -149,6 +159,44 @@ void cli_context::fetch_server_props() { } } +bool cli_context::list_and_ask_models() { + auto models = client.list_models(); + std::string message = "\nAvailable models:"; + if (!models.empty()) { + for (size_t i = 0; i < models.size(); ++i) { + message += "\n " + std::to_string(i + 1) + ". " + models[i]; + } + } + message += "\n"; + view::show_message(message); + std::string selection; + while (selection.empty()) { + if (should_stop()) { + return false; + } + view::user_turn user_turn; + selection = user_turn.read_input(false, "Select model by number: "); + if (selection.empty()) { + continue; + } + try { + size_t idx = std::stoul(selection); + if (idx > 0 && idx <= models.size()) { + model_name = models[idx - 1]; + client.model = model_name; + view::show_message("Selected model: " + model_name); + break; + } + } catch (...) { + // ignore + } + view::show_error("Invalid selection. Please enter a valid number."); + selection.clear(); + continue; + } + return true; +} + void cli_context::add_system_prompt() { if (!params.system_prompt.empty()) { messages.push_back({ diff --git a/tools/cli/cli-context.h b/tools/cli/cli-context.h index 2c67586d638d..99c65cdac480 100644 --- a/tools/cli/cli-context.h +++ b/tools/cli/cli-context.h @@ -52,6 +52,10 @@ struct cli_context { void add_system_prompt(); void push_user_message(const std::string & text); + // check if server have multiple models (router mode) + // if yes, list them then ask; do nothing otherwise + bool list_and_ask_models(); + // read a file and stage it as a multimodal content part; type is one of // "image", "audio", "video"; returns false if the file cannot be read bool stage_media_file(const std::string & fname, const std::string & type); diff --git a/tools/cli/cli-view.h b/tools/cli/cli-view.h index 852ae7e73319..4822d27b66b5 100644 --- a/tools/cli/cli-view.h +++ b/tools/cli/cli-view.h @@ -162,8 +162,12 @@ namespace view { console::log("\n> %s\n", buffer.c_str()); } } - std::string read_input(bool multiline_input) { - console::log("\n> "); + std::string read_input(bool multiline_input, const char * prompt = nullptr) { + if (prompt) { + console::log("%s", prompt); + } else { + console::log("\n> "); + } std::string buffer; std::string line; bool another_line = true; From b093e468732fd8d9582792bf6fee6550b3028455 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 23 Jun 2026 16:47:30 +0200 Subject: [PATCH 6/9] case: router with only one model --- tools/cli/cli-context.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/cli/cli-context.cpp b/tools/cli/cli-context.cpp index 9d906ccbaba1..3c8ddd478e05 100644 --- a/tools/cli/cli-context.cpp +++ b/tools/cli/cli-context.cpp @@ -161,6 +161,14 @@ void cli_context::fetch_server_props() { bool cli_context::list_and_ask_models() { auto models = client.list_models(); + + // only one model: use it without asking + if (models.size() == 1) { + model_name = models[0]; + client.model = model_name; + return true; + } + std::string message = "\nAvailable models:"; if (!models.empty()) { for (size_t i = 0; i < models.size(); ++i) { From beef5cf077e6e967fee8eb302665988ae530864f Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 23 Jun 2026 22:48:04 +0200 Subject: [PATCH 7/9] Apply suggestions from code review Co-authored-by: Piotr Wilkin (ilintar) --- tools/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 3b55c5f4be22..54d960a63b94 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -290,7 +290,7 @@ int llama_server(common_params & params, int argc, char ** argv) { return child.run_download(params); } else if (!is_router_server && !is_run_by_cli) { // single-model mode (NOT spawned by router) - // if this is invoked by CLI, model downloading should already handled + // if this is invoked by CLI, model downloading should be already handled common_params_handle_models(params, LLAMA_EXAMPLE_SERVER, {}); } @@ -373,7 +373,7 @@ int llama_server(common_params & params, int argc, char ** argv) { }; } - // register signal handler is not running by CLI + // register signal handler if not running by CLI if (!is_run_by_cli) { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; From 5d67f69f59e8f2d8e2fa5cd8d22d5f091f1f1db5 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 23 Jun 2026 22:49:40 +0200 Subject: [PATCH 8/9] remove outdated comment --- tools/cli/cli-server.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/cli/cli-server.h b/tools/cli/cli-server.h index 50447f255114..d058c8418380 100644 --- a/tools/cli/cli-server.h +++ b/tools/cli/cli-server.h @@ -4,9 +4,6 @@ #include "http.h" -// spawn llama-server in a thread and interact with it via a random port -// note: in the future, we may have a server running as daemon and the CLI can connect to it automatically - // llama_server will be available as a dynamic library symbol int llama_server(common_params & params, int argc, char ** argv); void llama_server_terminate(); @@ -28,6 +25,7 @@ struct cli_server { } } + // spawn llama-server in a thread and interact with it via a random port bool start(common_params & params) { port = common_http_get_free_port(); if (port <= 0) { From a432e6f8633624ef9f9c319d017caf0e21e839b4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 23 Jun 2026 22:57:20 +0200 Subject: [PATCH 9/9] use destructor instead --- tools/cli/cli-context.h | 3 +++ tools/cli/cli.cpp | 7 +------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/cli/cli-context.h b/tools/cli/cli-context.h index 99c65cdac480..775895cce2c4 100644 --- a/tools/cli/cli-context.h +++ b/tools/cli/cli-context.h @@ -35,6 +35,9 @@ struct cli_context { bool has_video = false; cli_context(const common_params & params) : params(params) {} + ~cli_context() { + shutdown(); + } // connect to --server-base or spawn a local llama-server child; // argc/argv are needed to forward the server-relevant args to the child diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index e832f903e500..71a4828ad7dc 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -59,13 +59,8 @@ int llama_cli(int argc, char ** argv) { cli_context ctx_cli(params); if (!ctx_cli.init()) { - ctx_cli.shutdown(); return 1; } - int ret = ctx_cli.run(); - - ctx_cli.shutdown(); - - return ret; + return ctx_cli.run(); }