diff --git a/common/common.cpp b/common/common.cpp index b6a7626f2a1d..055772d711fe 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1312,6 +1312,12 @@ std::vector & common_init_result::lora() { } common_init_result_ptr common_init_from_params(common_params & params, bool model_only) { + // report the load phase up front (router progress UI); fit + metadata read emit nothing, + // so without this the UI sticks on "download 100%" until the per-tensor callback starts + if (params.load_stage_callback) { + params.load_stage_callback(COMMON_LOAD_STAGE_LOAD, -1.0f, params.load_stage_callback_user_data); + } + common_init_result_ptr res(new common_init_result(params, model_only)); llama_model * model = res->model(); @@ -1387,6 +1393,10 @@ common_init_result_ptr common_init_from_params(common_params & params, bool mode } if (params.warmup) { + // report the warmup phase (router progress UI) + if (params.load_stage_callback) { + params.load_stage_callback(COMMON_LOAD_STAGE_WARMUP, -1.0f, params.load_stage_callback_user_data); + } LOG_INF("%s: warming up the model with an empty run - please wait ... 
(--no-warmup to disable)\n", __func__); std::vector tmp; diff --git a/common/common.h b/common/common.h index 13f387271d81..067e443fe01a 100644 --- a/common/common.h +++ b/common/common.h @@ -423,6 +423,15 @@ struct lr_opt { struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata); +// load-stage names for common_load_stage_callback; keep the webui switch in sync (getModelLoadPhase in models/utils.ts) +#define COMMON_LOAD_STAGE_DOWNLOAD "download" // file download (router child only) +#define COMMON_LOAD_STAGE_LOAD "load" // tensor loading +#define COMMON_LOAD_STAGE_WARMUP "warmup" // empty-run warmup +#define COMMON_LOAD_STAGE_FINALIZE "finalize" // post-warmup setup: chat templates / seq-rm tests (context alloc already happened during load) + +// coarse load-stage reporting for router mode; progress in [0,1], or <0 if indeterminate +typedef void (*common_load_stage_callback)(const char * stage, float progress, void * user_data); + struct common_params { int32_t n_predict = -1; // max. 
number of new tokens to predict, -1 == no limit int32_t n_ctx = 0; // context size, 0 == context the model was trained with @@ -701,6 +710,9 @@ struct common_params { // return false from callback to abort model loading or true to continue llama_progress_callback load_progress_callback = NULL; void * load_progress_callback_user_data = NULL; + // optional callback for coarse load-stage reporting (used by router mode to drive a progress UI) + common_load_stage_callback load_stage_callback = NULL; + void * load_stage_callback_user_data = NULL; bool no_alloc = false; // Don't allocate model buffers }; diff --git a/common/download.cpp b/common/download.cpp index 40f6eb780f41..4f5affc1b742 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -475,17 +475,38 @@ std::pair> common_remote_get_content(const std::string return { res->status, std::move(buf) }; } +static common_download_callback * g_default_download_callback = nullptr; + +void common_download_set_default_callback(common_download_callback * callback) { + g_default_download_callback = callback; +} + int common_download_file_single(const std::string & url, const std::string & path, const common_download_opts & opts, bool skip_etag) { - if (!opts.offline) { + // resolve the effective callback: per-call > process-wide default + common_download_opts eff = opts; + if (!eff.callback) { + eff.callback = g_default_download_callback; + } + + if (!eff.offline) { ProgressBar tty_cb; - common_download_opts online_opts = opts; - if (!online_opts.callback) { - online_opts.callback = &tty_cb; + if (!eff.callback) { + eff.callback = &tty_cb; } - return common_download_file_single_online(url, path, online_opts, skip_etag); + const int status = common_download_file_single_online(url, path, eff, skip_etag); + // the online path returns 304 (cached, not modified) before emitting any callback; + // surface a cached start/done pair so aggregators still see every file exactly once + if (status == 304 && eff.callback) { + 
common_download_progress p; + p.url = url; + p.cached = true; + eff.callback->on_start(p); + eff.callback->on_done(p, true); + } + return status; } if (!std::filesystem::exists(path)) { @@ -496,12 +517,12 @@ int common_download_file_single(const std::string & url, LOG_DBG("%s: using cached file (offline mode): %s\n", __func__, path.c_str()); // notify the callback that the file was cached - if (opts.callback) { + if (eff.callback) { common_download_progress p; p.url = url; p.cached = true; - opts.callback->on_start(p); - opts.callback->on_done(p, true); + eff.callback->on_start(p); + eff.callback->on_done(p, true); } return 304; // Not Modified - fake cached response @@ -814,6 +835,13 @@ common_download_model_result common_download_model(const common_params_model & return result; } + // announce the full file set up front so a progress aggregator can form a stable + // denominator (multi-part GGUFs download in parallel below). use the effective callback, + // mirroring common_download_file_single's per-call > process-wide resolution. + if (common_download_callback * cb = opts.callback ? opts.callback : g_default_download_callback) { + cb->on_plan(tasks.size()); + } + std::vector> futures; for (const auto & task : tasks) { futures.push_back(std::async(std::launch::async, diff --git a/common/download.h b/common/download.h index ebeedd6058c7..a9c76c1a7634 100644 --- a/common/download.h +++ b/common/download.h @@ -18,12 +18,20 @@ struct common_download_progress { class common_download_callback { public: virtual ~common_download_callback() = default; + // called once before any file starts, with the number of files about to be downloaded; + // lets aggregators know the full set up front (e.g. multi-part GGUFs) instead of discovering + // files lazily as their callbacks fire. optional: default no-op. 
+ virtual void on_plan(size_t total_files) { (void) total_files; } virtual void on_start(const common_download_progress & p) = 0; virtual void on_update(const common_download_progress & p) = 0; virtual void on_done(const common_download_progress & p, bool ok) = 0; virtual bool is_cancelled() const { return false; } }; +// process-wide default download callback, used when common_download_opts::callback is unset (nullptr to clear). +// borrowed, not owned: must outlive any download that uses it. +void common_download_set_default_callback(common_download_callback * callback); + struct common_remote_params { common_header_list headers; long timeout = 0; // in seconds, 0 means no timeout diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 07759f417084..8d86e226bd48 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1004,6 +1004,12 @@ struct server_context_impl { } } + // init done (weights + warmup); mark finalize so the UI doesn't stick on the warmup phase + // while the post-warmup setup below (chat templates / seq-rm tests) runs + if (params_base.load_stage_callback) { + params_base.load_stage_callback(COMMON_LOAD_STAGE_FINALIZE, -1.0f, params_base.load_stage_callback_user_data); + } + if (!llama_memory_can_shift(llama_get_memory(ctx_tgt))) { if (params_base.ctx_shift) { params_base.ctx_shift = false; diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 49b0e423f462..226010d7be92 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include @@ -46,6 +48,9 @@ extern char **environ; #define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep #define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep" #define CMD_CHILD_TO_ROUTER_INFO "cmd_child_to_router:info:" // followed by json string +// load stage report: "<stage>" or "<stage>:<fraction>" 
(no fraction => indeterminate stage) +// stages are the COMMON_LOAD_STAGE_* names (download / load / warmup / finalize) +#define CMD_CHILD_TO_ROUTER_STAGE "cmd_child_to_router:stage:" // address for child process, this is needed because router may run on 0.0.0.0 // ref: https://github.com/ggml-org/llama.cpp/issues/17862 @@ -762,9 +767,11 @@ void server_models::load(const std::string & name) { instance_t inst; inst.meta = meta; inst.meta.port = get_free_port(); - inst.meta.status = SERVER_MODEL_STATUS_LOADING; - inst.meta.loaded_info = json{}; - inst.meta.last_used = ggml_time_ms(); + inst.meta.status = SERVER_MODEL_STATUS_LOADING; + inst.meta.loaded_info = json{}; + inst.meta.load_stage = ""; // reset stale stage/progress from a previous load + inst.meta.load_progress = -1.0f; + inst.meta.last_used = ggml_time_ms(); if (inst.meta.port <= 0) { throw std::runtime_error("failed to get a port number"); @@ -821,6 +828,16 @@ void server_models::load(const std::string & name) { this->update_loaded_info(name, str); } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) { this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0); + } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_STAGE)) { + std::string payload = string_strip(str.substr(strlen(CMD_CHILD_TO_ROUTER_STAGE))); + std::string stage = payload; + float progress = -1.0f; + auto colon = payload.find(':'); + if (colon != std::string::npos) { + stage = payload.substr(0, colon); + progress = strtof(payload.c_str() + colon + 1, nullptr); + } + this->update_stage(name, stage, progress); } } } else { @@ -985,6 +1002,16 @@ void server_models::update_loaded_info(const std::string & name, std::string & r cv.notify_all(); } +void server_models::update_stage(const std::string & name, const std::string & stage, float progress) { + std::unique_lock lk(mutex); + auto it = mapping.find(name); + if (it != mapping.end()) { + it->second.meta.load_stage = stage; + it->second.meta.load_progress = progress; + } + 
cv.notify_all(); +} + void server_models::wait_until_loading_finished(const std::string & name) { std::unique_lock lk(mutex); cv.wait(lk, [this, &name]() { @@ -1106,6 +1133,112 @@ void server_models::notify_router_sleeping_state(bool is_sleeping) { common_log_resume(common_log_main()); } +void server_models::notify_router_stage(const char * stage, float progress) { + // write in a single fputs to avoid interleaving with loader logging on the shared stdout + char line[96]; + if (progress >= 0.0f) { + snprintf(line, sizeof(line), "%s%s:%.4f\n", CMD_CHILD_TO_ROUTER_STAGE, stage, progress); + } else { + snprintf(line, sizeof(line), "%s%s\n", CMD_CHILD_TO_ROUTER_STAGE, stage); + } + common_log_pause(common_log_main()); + fflush(stdout); + fputs(line, stdout); + fflush(stdout); + common_log_resume(common_log_main()); +} + +// funnels all stage emissions to the router. progress callbacks fire per tensor/chunk (hundreds of +// times), so only forward on phase change, integer-percent advance, or completion. +struct stage_emitter { + std::string last_stage; + int last_pct = -1; + + void emit(const char * stage, float progress) { + const int pct = (int) (progress * 100.0f); + if (last_stage != stage || pct != last_pct || progress >= 1.0f) { + last_stage = stage; + last_pct = pct; + server_models::notify_router_stage(stage, progress); + } + } +}; + +// single model per child; emission sources don't overlap in time (download at arg-parse, then +// single-threaded load/warmup/finalize), and the download callback serializes its part threads. +static stage_emitter g_stage_emitter; + +bool server_models::child_load_progress_callback(float progress, void * /*user_data*/) { + g_stage_emitter.emit(COMMON_LOAD_STAGE_LOAD, progress); + return true; // never abort loading +} + +void server_models::child_load_stage_callback(const char * stage, float progress, void * /*user_data*/) { + g_stage_emitter.emit(stage, progress); // coarse phase markers (e.g. 
warmup / finalize) +} + +// forwards download progress to the router. multi-part GGUFs download parts in parallel, each with +// its own byte counts, so aggregate across parts for a whole-model 0->1 instead of one racing part. +// +// the per-file total arrives lazily (from each file's HEAD), so a naive sum over only-seen files has +// a growing denominator and the percent regresses (e.g. one part hits 100% before the rest register). +// to avoid that, stay indeterminate until every expected file is resolved (size known, or finished +// for a cached/unknown-size file); only then is the denominator stable and the percent monotonic. +struct child_download_progress_callback : common_download_callback { + void on_plan(size_t total_files) override { + std::lock_guard lock(mutex); + // fresh accounting per download pass (defensive: child loads one model, but a validation + // pass could call this twice) so a stale denominator can't leak across passes + files.clear(); + resolved.clear(); + expected = total_files; + } + void on_start(const common_download_progress & p) override { record(p, /*done=*/false); } + void on_update(const common_download_progress & p) override { record(p, /*done=*/false); } + void on_done(const common_download_progress & p, bool /*ok*/) override { record(p, /*done=*/true); } + +private: + std::mutex mutex; + std::unordered_map> files; // url -> {downloaded, total} + std::unordered_set resolved; // urls with a known size or finished + size_t expected = 0; // total files, from on_plan + + void record(const common_download_progress & p, bool done) { + // serializing here also protects g_stage_emitter's state from the parallel part threads + std::lock_guard lock(mutex); + + if (p.total > 0) { + files[p.url] = { p.downloaded, p.total }; + resolved.insert(p.url); + } else if (done) { + // cached / unknown-size file finished: count it resolved so we don't wait forever + files.emplace(p.url, std::make_pair(0, 0)); + resolved.insert(p.url); + } + + // 
until every file's size is known the denominator keeps growing; report indeterminate so + // the UI shows an animated download phase instead of a percentage that jumps backwards + if (expected == 0 || resolved.size() < expected) { + g_stage_emitter.emit(COMMON_LOAD_STAGE_DOWNLOAD, -1.0f); + return; + } + + size_t downloaded = 0; + size_t total = 0; + for (const auto & f : files) { + downloaded += f.second.first; + total += f.second.second; + } + // total == 0 means every file was cached/unknown-size: nothing to transfer, report complete + g_stage_emitter.emit(COMMON_LOAD_STAGE_DOWNLOAD, total > 0 ? (float) downloaded / (float) total : 1.0f); + } +}; + +void server_models::register_child_download_progress() { + static child_download_progress_callback cb; + common_download_set_default_callback(&cb); +} + // // server_models_routes @@ -1238,6 +1371,14 @@ void server_models_routes::init_routes() { {"value", server_model_status_to_string(meta.status)}, {"args", meta.args}, }; + if (meta.status == SERVER_MODEL_STATUS_LOADING) { + if (!meta.load_stage.empty()) { + status["stage"] = meta.load_stage; + } + if (meta.load_progress >= 0.0f) { + status["progress"] = meta.load_progress; + } + } if (!meta.preset.name.empty()) { common_preset preset_copy = meta.preset; unset_reserved_args(preset_copy, false); diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 2198589a7aa2..11b0fa80773b 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -68,6 +68,8 @@ struct server_model_meta { int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown mtmd_caps multimodal; // multimodal capabilities bool need_download = false; // whether the model needs to be downloaded before loading + std::string load_stage = ""; // current load stage ("download"/"load"/"warmup"/"finalize"), valid while status == LOADING + float load_progress = -1.0f; // progress 0..1 within the current stage, or <0 if indeterminate 
bool is_ready() const { return status == SERVER_MODEL_STATUS_LOADED; @@ -150,6 +152,7 @@ struct server_models { // update the status of a model instance (thread-safe) void update_status(const std::string & name, server_model_status status, int exit_code); void update_loaded_info(const std::string & name, std::string & raw_info); + void update_stage(const std::string & name, const std::string & stage, float progress); // wait until the model instance is fully loaded (thread-safe) // return when the model no longer in "loading" state @@ -172,6 +175,19 @@ struct server_models { // notify the router server that the sleeping state has changed static void notify_router_sleeping_state(bool sleeping); + + // notify the router server of the current load stage (progress < 0 => indeterminate phase) + static void notify_router_stage(const char * stage, float progress); + + // llama_progress_callback for tensor loading: forwards throttled "load" progress to the router + static bool child_load_progress_callback(float progress, void * user_data); + + // common_params load_stage_callback: forwards a coarse phase marker (e.g. "finalize") to the router + static void child_load_stage_callback(const char * stage, float progress, void * user_data); + + // register a process-wide download progress callback that forwards "download" progress to the router + // (must be called before model download happens, i.e. 
before common_params_parse) + static void register_child_download_progress(); }; struct server_models_routes { diff --git a/tools/server/server.cpp b/tools/server/server.cpp index a6ea749d0c3f..9a8e0b5ebcb8 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -82,6 +82,11 @@ int llama_server(int argc, char ** argv) { common_init(); + // child download happens during arg parsing, so register the forwarding callback beforehand + if (server_models::is_child_server()) { + server_models::register_child_download_progress(); + } + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { return 1; } @@ -298,6 +303,11 @@ int llama_server(int argc, char ** argv) { ctx_server.on_sleeping_changed([&](bool sleeping) { server_models::notify_router_sleeping_state(sleeping); }); + // forward load stages to the router: "load" via the progress callback, warmup/finalize via the stage callback + params.load_progress_callback = server_models::child_load_progress_callback; + params.load_progress_callback_user_data = nullptr; + params.load_stage_callback = server_models::child_load_stage_callback; + params.load_stage_callback_user_data = nullptr; } if (!ctx_server.load_model(params)) { diff --git a/tools/ui/src/lib/components/app/models/ModelsSelectorDropdown.svelte b/tools/ui/src/lib/components/app/models/ModelsSelectorDropdown.svelte index 40006a4c9359..2ca9592f8a2d 100644 --- a/tools/ui/src/lib/components/app/models/ModelsSelectorDropdown.svelte +++ b/tools/ui/src/lib/components/app/models/ModelsSelectorDropdown.svelte @@ -1,5 +1,5 @@
{#if isLoading} - + {#if loadPhase} + {@const Icon = loadPhase.icon} + + + + {#if loadPhase.numeric && loadProgress != null} + + + {Math.round(loadProgress * 100)}% + + {/if} + + {:else} + + {/if} {:else if isFailed}
diff --git a/tools/ui/src/lib/components/app/models/ModelsSelectorSheet.svelte b/tools/ui/src/lib/components/app/models/ModelsSelectorSheet.svelte index 2ddbf24055b8..599befec4257 100644 --- a/tools/ui/src/lib/components/app/models/ModelsSelectorSheet.svelte +++ b/tools/ui/src/lib/components/app/models/ModelsSelectorSheet.svelte @@ -1,11 +1,12 @@ + +{#if showLoading} + {#if loadPhase} + {@const Icon = loadPhase.icon} + + + + {#if loadPhase.numeric && load?.progress != null} + + + {Math.round(load.progress * 100)}% + + {/if} + + {:else} + + {/if} +{:else if isFailed} + + + +{:else} + +{/if} diff --git a/tools/ui/src/lib/components/app/models/index.ts b/tools/ui/src/lib/components/app/models/index.ts index 3ac6ecb678b2..d00b9318f057 100644 --- a/tools/ui/src/lib/components/app/models/index.ts +++ b/tools/ui/src/lib/components/app/models/index.ts @@ -65,6 +65,14 @@ export { default as ModelsSelectorList } from './ModelsSelectorList.svelte'; */ export { default as ModelsSelectorOption } from './ModelsSelectorOption.svelte'; +/** + * **ModelsSelectorTriggerIndicator** - Trailing trigger indicator + * + * Shared by the desktop dropdown and mobile sheet triggers: renders the load + * phase (icon + percent), a spinner, or the chevron for a given model. 
/**
 * Resolve a router load stage name to the icon, label and animation used to render it.
 *
 * The stage names mirror the COMMON_LOAD_STAGE_* strings in common/common.h. The mapping is
 * shared by the dropdown trigger and the option rows so both render the same icon/label/%.
 *
 * @param stage - stage name reported by the router, if any
 * @returns a fresh ModelLoadPhase for a known stage, or null for a missing/unknown one
 */
export function getModelLoadPhase(stage: string | null | undefined): ModelLoadPhase | null {
	// phases that report a 0..1 fraction are rendered as a percentage and need no animation
	const withPercent = (icon: ModelLoadPhase['icon'], label: string): ModelLoadPhase => ({
		icon,
		label,
		numeric: true,
		anim: ''
	});
	// indeterminate phases carry an animation class instead of a percentage
	const animated = (
		icon: ModelLoadPhase['icon'],
		label: string,
		anim: string
	): ModelLoadPhase => ({ icon, label, numeric: false, anim });

	if (stage === 'download') return withPercent(ArrowDownToLine, 'Downloading');
	if (stage === 'load') return withPercent(Layers, 'Loading weights');
	if (stage === 'warmup') return animated(Flame, 'Warming up', 'animate-pulse');
	if (stage === 'finalize') return animated(Settings, 'Finalizing', 'animate-spin');

	return null;
}
null; } getModelUsage(modelId: string): SvelteSet { @@ -384,8 +387,8 @@ class ModelsStore { * No-op in router mode — fetch() already calls listRouter() internally. * Kept for API compatibility (e.g. handleOpenChange dropdown open handler). */ - async fetchRouterModels(): Promise { - if (!isRouterMode()) return; + async fetchRouterModels(): Promise { + if (!isRouterMode()) return true; try { const response = await ModelsService.listRouter(); @@ -396,9 +399,11 @@ class ModelsStore { if (visible.length === 1 && this.isModelLoaded(visible[0].model)) { this.selectModelById(visible[0].id); } + return true; } catch (error) { console.warn('Failed to fetch router models:', error); this.routerModels = []; + return false; } } @@ -637,6 +642,8 @@ class ModelsStore { */ private static readonly STATUS_POLL_INTERVAL = 500; + // give up after this many consecutive failed fetches (e.g. router died mid-load) instead of spinning forever + private static readonly MAX_POLL_FAILURES = 6; /** * Poll for expected model status after load/unload operation. @@ -647,8 +654,18 @@ class ModelsStore { expectedStatus: ServerModelStatus ): Promise { let attempt = 0; + let failures = 0; while (true) { - await this.fetchRouterModels(); + const ok = await this.fetchRouterModels(); + if (!ok) { + // tolerate transient blips, but bail if the server stays unreachable + if (++failures >= ModelsStore.MAX_POLL_FAILURES) { + throw new Error('Lost connection to the server while loading the model'); + } + await new Promise((resolve) => setTimeout(resolve, ModelsStore.STATUS_POLL_INTERVAL)); + continue; + } + failures = 0; const currentStatus = this.getModelStatus(modelId); if (currentStatus === expectedStatus) return; @@ -693,6 +710,37 @@ class ModelsStore { } } + /** + * Follow a load started elsewhere (another client/tab) so this UI keeps updating its + * progress. 
Same path as loadModel() without the load() call — the modelLoadingStates + * guard keeps it from stacking on a load we already started or observe. + */ + observeModelLoad(modelId: string): void { + if (this.isModelLoaded(modelId)) return; + if (this.modelLoadingStates.get(modelId)) return; + + this.modelLoadingStates.set(modelId, true); + this.pollForModelStatus(modelId, ServerModelStatus.LOADED) + .then(() => this.updateModelModalities(modelId)) + // the load aborting/failing on the server is the normal end of observing, not our error + .catch((error) => console.warn(`Stopped observing load for ${modelId}:`, error)) + .finally(() => this.modelLoadingStates.set(modelId, false)); + } + + /** + * Follow every load the router currently reports, so out-of-band loads keep ticking + * here. Safe to call repeatedly — observeModelLoad() no-ops on loads already followed. + */ + observeOngoingLoads(): void { + if (!isRouterMode()) return; + + for (const model of this.routerModels) { + if (model.status.value === ServerModelStatus.LOADING && !model.status.failed) { + this.observeModelLoad(model.id); + } + } + } + async unloadModel(modelId: string): Promise { if (!this.isModelLoaded(modelId)) return; if (this.modelLoadingStates.get(modelId)) return; diff --git a/tools/ui/src/lib/types/api.d.ts b/tools/ui/src/lib/types/api.d.ts index f620d67351ba..6e63ffd05e34 100644 --- a/tools/ui/src/lib/types/api.d.ts +++ b/tools/ui/src/lib/types/api.d.ts @@ -65,6 +65,14 @@ export interface ApiModelStatus { value: ServerModelStatus; /** Command line arguments used when loading (only for loaded models) */ args?: string[]; + /** Current load phase while loading: "download" | "load" | "warmup" | "finalize" */ + stage?: string; + /** Progress 0..1 within the current phase; absent for indeterminate phases */ + progress?: number; + /** True when the model instance crashed/exited with a non-zero code (value stays "unloaded") */ + failed?: boolean; + /** Exit code of the crashed instance (present 
when failed is true) */ + exit_code?: number; } /**