Skip to content
84 changes: 84 additions & 0 deletions common/download.cpp

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @angt if you can have a quick look on common_download_remove (not sure if you prefer this part to be a dedicate PR?)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i was doing a PR with the same function hahaha

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While working on llama cache and llama download

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah yeah perfect, I'll wait for your implementation then

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you're ready to merge, let's go with this, i'll adapt mine 👍

Original file line number Diff line number Diff line change
Expand Up @@ -997,3 +997,87 @@ std::vector<common_cached_model_info> common_list_cached_models() {

return result;
}

// Remove a cached model from disk.
//
// hf_repo_with_tag: "user/model" or "user/model:tag"
//  - without a tag, the whole repo cache is removed via hf_cache::remove_cached_repo()
//  - with a tag, only the cached files whose GGUF tag equals the upper-cased tag are
//    removed, followed by any blob that no remaining snapshot entry links to
//
// Returns true only if at least one file was actually deleted. Individual removal
// failures are logged as warnings and do not abort the remaining removals.
bool common_download_remove(const std::string & hf_repo_with_tag) {
    namespace fs = std::filesystem;

    auto [repo_id, tag] = common_download_split_repo_tag(hf_repo_with_tag);

    if (tag.empty()) {
        return hf_cache::remove_cached_repo(repo_id);
    }

    std::string tag_upper = tag;
    for (char & c : tag_upper) {
        c = (char) std::toupper((unsigned char) c);
    }

    // resolve the blob that a snapshot symlink points to
    // returns false when `link` is not a symlink or its target cannot be read
    const auto resolve_blob = [](const fs::path & link, fs::path & blob) {
        std::error_code ec;
        if (!fs::is_symlink(link, ec)) {
            return false;
        }
        const fs::path target = fs::read_symlink(link, ec);
        if (ec) {
            return false;
        }
        // a relative link target is resolved against the directory holding the link
        blob = (link.parent_path() / target).lexically_normal();
        return true;
    };

    // collect snapshot entries whose tag matches
    std::vector<fs::path> to_remove;
    for (const auto & f : hf_cache::get_cached_files(repo_id)) {
        if (get_gguf_split_info(f.path).tag == tag_upper) {
            to_remove.emplace_back(f.local_path);
        }
    }

    if (to_remove.empty()) {
        return false;
    }

    // resolve blob paths from symlinks before deleting snapshot entries
    // (once the link is gone, its target can no longer be read)
    std::vector<fs::path> blobs_to_check;
    for (const auto & p : to_remove) {
        fs::path blob;
        if (resolve_blob(p, blob)) {
            blobs_to_check.push_back(blob);
        }
    }

    // remove snapshot entries, remembering whether anything was really deleted
    bool removed_any = false;
    for (const auto & p : to_remove) {
        std::error_code ec;
        if (fs::remove(p, ec)) {
            removed_any = true;
        } else if (ec) {
            LOG_WRN("%s: failed to remove %s: %s\n", __func__, p.string().c_str(), ec.message().c_str());
        }
    }

    if (blobs_to_check.empty()) {
        return removed_any;
    }

    // collect blobs still referenced by remaining snapshot entries
    std::unordered_set<std::string> still_referenced;
    for (const auto & f : hf_cache::get_cached_files(repo_id)) {
        fs::path blob;
        if (resolve_blob(fs::path(f.local_path), blob)) {
            still_referenced.insert(blob.string());
        }
    }

    // remove orphaned blobs
    for (const auto & blob : blobs_to_check) {
        if (still_referenced.find(blob.string()) != still_referenced.end()) {
            continue;
        }
        std::error_code ec;
        if (fs::remove(blob, ec)) {
            removed_any = true;
        } else if (ec) {
            LOG_WRN("%s: failed to remove blob %s: %s\n", __func__, blob.string().c_str(), ec.message().c_str());
        }
    }

    return removed_any;
}
7 changes: 7 additions & 0 deletions common/download.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,10 @@ int common_download_file_single(const std::string & url,
// Resolve and download a model from a Docker registry.
// Returns the local path to the downloaded model file.
std::string common_docker_resolve_model(const std::string & docker);

// Remove a cached model from disk.
// Input format: "user/model" or "user/model:tag"
// - if the tag is omitted, removes the entire repo cache directory
// - if a tag is present, removes only the cached files matching that tag, then any
//   blob that is no longer referenced by a remaining snapshot entry (orphaned blob)
// Returns true if anything was removed; returns false when no cached file matches the tag.
bool common_download_remove(const std::string & hf_repo_with_tag);
15 changes: 15 additions & 0 deletions common/hf-cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -495,4 +495,19 @@ std::string finalize_file(const hf_file & file) {
return file.final_path;
}

bool remove_cached_repo(const std::string & repo_id) {
if (!is_valid_repo_id(repo_id)) {
LOG_WRN("%s: invalid repository: %s\n", __func__, repo_id.c_str());
return false;
}
fs::path repo_path = get_repo_path(repo_id);
std::error_code ec;
auto removed = fs::remove_all(repo_path, ec);
if (ec) {
LOG_ERR("%s: failed to remove repo cache %s: %s\n", __func__, repo_path.string().c_str(), ec.message().c_str());
return false;
}
return removed > 0;
}

} // namespace hf_cache
3 changes: 3 additions & 0 deletions common/hf-cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,7 @@ hf_files get_cached_files(const std::string & repo_id = {});
// Create the snapshot path for `file` (by link, or by move/copy) and return it.
std::string finalize_file(const hf_file & file);

// Remove the entire cached directory for a repo.
// Returns true if something was removed; returns false (after logging) when
// `repo_id` is not a valid repo id or when the removal fails.
bool remove_cached_repo(const std::string & repo_id);

} // namespace hf_cache
18 changes: 18 additions & 0 deletions tools/server/README-dev.md
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,24 @@ That requires `JSON.stringify` when formatted to message content:
}
```

### Model management API (router mode)

The model management API was added via PR [#23976](https://github.com/ggml-org/llama.cpp/pull/23976)

The main goal of this API is to allow downloading models and/or removing models from the web UI. It relies on the model cache infrastructure under the hood to manage the list of models dynamically.

Instead of building everything from the ground up (like what most AI agents will do when you ask them to implement a similar feature), we built on top of existing, already well-engineered components inside the codebase:
- Model cache infrastructure as mentioned above (`common/download.h`)
- Server response queue (`server-queue.h`). We use this feature to broadcast events to SSE clients.
- Server router thread management (`server-models.h`). We re-use the same thread model that is used for managing subprocess life cycle, except that we don't create a new subprocess, but launch the download right inside the thread.

The flow for downloading a new model:
- POST request comes in --> `post_router_models` --> validation
- `server_models::download()` is called
- Sets up a new thread `inst.th` and runs the download inside
- If a stop request comes in, set `stop_download` to `true`
- Otherwise, upon completion, we call `load_models()` to refresh the list of models

### Notable Related PRs

- Initial server implementation: https://github.com/ggml-org/llama.cpp/pull/1443
Expand Down
115 changes: 115 additions & 0 deletions tools/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1778,6 +1778,20 @@ The `status` object can be:
}
```

Note: in the "downloading" state, multiple files can be downloading in parallel

```json
"status": {
"value": "downloading",
"progress": {
"https://...model.gguf": {
"done": 195963406,
"total": 219307424
}
}
}
```

### POST `/models/load`: Load a model

Load a model
Expand Down Expand Up @@ -1820,6 +1834,107 @@ Response:
}
```

### GET `/models/sse`: Real-time events

Example events:

```js
{
"model": "...",
"event": "model_status",
"data": {
"status": "loading"
}
}

{
"model": "...",
"event": "download_progress",
"data": {
// note: there can be multiple files being downloaded in parallel
"https://...model.gguf": {
"done": 195963406,
"total": 219307424
}
}
}

{
"model": "...",
"event": "download_finished",
"data": {
"status": "loading"
}
}

{
"model": "...",
"event": "model_remove"
}

// special event: reload of the list of all models
{
"model": "*",
"event": "models_reload"
}
```

### POST `/models`: Download new model

Trigger a new download (non-blocking), the progress can be tracked via SSE endpoint `/models/sse`

To cancel a download in progress, send a request to `/models/unload`

Download procedure:
- Send POST request to `/models`
- Subscribe to `/models/sse` for updates
- When the download completes, you will receive either a `download_finished` or a `download_failed` event
- Call GET `/models` to trigger a model list update. If the download succeeded, you should see the new model in the list

Payload:

```json
{
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M",
}
```

Response (download is started in the background):

```json
{
"success": true
}
```

Response (error, cannot start the download):

```json
{
"error": {
"code": 400,
"message": "model validation failed, unable to download",
"type": "invalid_request_error"
}
}
```

### DELETE `/models`: Delete a model from cache

IMPORTANT: only models stored in the cache can be deleted. You cannot delete models that are defined in a preset.

Model name must be passed via query param: `?model={name}`

If the deletion succeeds, the server sends an SSE event of type `model_remove`

Response:

```json
{
"success": true
}
```

## API errors

`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
Expand Down
17 changes: 17 additions & 0 deletions tools/server/server-http.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,23 @@ void server_http_context::post(const std::string & path, const server_http_conte
});
}

// Register `handler` for HTTP DELETE requests on `path`.
// The handler is recorded in `handlers` under `path` and bound on the underlying
// httplib server under `path_prefix + path`.
void server_http_context::del(const std::string & path, const server_http_context::handler_t & handler) const {
    handlers.emplace(path, handler);

    // adapter: wrap the httplib request, run the handler, then write its response back
    const auto on_delete = [handler](const httplib::Request & http_req, httplib::Response & http_res) {
        server_http_req_ptr wrapped_req = std::make_unique<server_http_req>(server_http_req{
            get_params(http_req),
            get_headers(http_req),
            http_req.path,
            build_query_string(http_req),
            http_req.body,
            {},
            http_req.is_connection_closed
        });
        server_http_res_ptr handler_res = handler(*wrapped_req);
        process_handler_response(std::move(wrapped_req), handler_res, http_res);
    };

    pimpl->srv->Delete(path_prefix + path, on_delete);
}

//
// Vertex AI Prediction protocol (AIP_PREDICT_ROUTE)
// https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements
Expand Down
1 change: 1 addition & 0 deletions tools/server/server-http.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ struct server_http_context {

void get(const std::string & path, const handler_t & handler) const;
void post(const std::string & path, const handler_t & handler) const;
// Register `handler` for HTTP DELETE requests on `path`
void del(const std::string & path, const handler_t & handler) const;

// Register the Google Cloud Platform (Vertex AI) compat (AIP_PREDICT_ROUTE env var, or /predict)
// Must be called AFTER all other API routes are registered
Expand Down
Loading
Loading