Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 6 additions & 43 deletions .github/workflows/cpp_server_build_test_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1178,49 +1178,12 @@ jobs:
Pop-Location
Write-Host "Embeddable Windows smoke test PASSED!" -ForegroundColor Green

test-dmg-inference:
name: Test .dmg - llamacpp (metal)
runs-on: macos-latest
needs: build-lemonade-macos-dmg
# Skip inference tests when signing is enabled (tag pushes or manual workflow_dispatch)
# Also skip when no signing secrets are available (tests already ran inline in the build job)
if: ${{ needs.build-lemonade-macos-dmg.outputs.has_signing == 'true' && !startsWith(github.ref, 'refs/tags/') && inputs.enable_signing != true }}
env:
LEMONADE_CI_MODE: "True"
PYTHONIOENCODING: utf-8
GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
LEMONADE_VERSION: ${{ needs.build-lemonade-macos-dmg.outputs.version }}
GGML_METAL_NO_RESIDENCY: "1"
steps:
- uses: actions/checkout@v5

- name: Set HF_HOME environment variable
run: echo "HF_HOME=$PWD/hf-cache" >> $GITHUB_ENV

- name: Install Lemonade Server (.pkg)
uses: ./.github/actions/install-lemonade-server-dmg
with:
version: ${{ env.LEMONADE_VERSION }}

- name: Setup Python and virtual environment
uses: ./.github/actions/setup-venv
with:
venv-name: '.venv'
python-version: '3.10'
requirements-file: 'test/requirements.txt'

- name: Test llamacpp (metal)
env:
HF_HOME: ${{ env.HF_HOME }}
run: |
set -e
.venv/bin/python test/server_llm.py --wrapped-server llamacpp --backend metal --server-binary /usr/local/bin/lemonade-server

- name: Capture and upload server logs
if: always()
uses: ./.github/actions/capture-server-logs
with:
artifact-name: server-logs-dmg-llamacpp
# Metal llama.cpp inference tests are intentionally not run in CI:
# GitHub-hosted macOS runners are virtualized and Apple's Virtualization
# Framework blocks Metal GPU access. llama.cpp's own CI faces this same
# limitation and only build-tests Metal. Metal inference must be validated
# manually on Apple Silicon hardware before bumping the "metal" version in
# backend_versions.json.

# ========================================================================
# CLI AND ENDPOINTS TESTS - Run on GitHub-hosted runners (no GPU needed)
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/resources/backend_versions.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"rocm-stable": "b8653",
"rocm-preview": "b8705",
"rocm-nightly": "b1238",
"metal": "b8460",
"metal": "b8766",
"cpu": "b8766"
},
"whispercpp": {
Expand Down
6 changes: 2 additions & 4 deletions src/cpp/resources/server_models.json
Original file line number Diff line number Diff line change
Expand Up @@ -873,8 +873,7 @@
"suggested": true,
"labels": [
"vision",
"tool-calling",
"hot"
"tool-calling"
],
"size": 19.7
},
Expand All @@ -885,8 +884,7 @@
"suggested": true,
"labels": [
"vision",
"tool-calling",
"hot"
"tool-calling"
],
"size": 68.4
},
Expand Down
6 changes: 6 additions & 0 deletions test/server_cli2.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import unittest
import uuid

from utils.capabilities import skip_on_macos_ci
from utils.server_base import wait_for_server
from utils.test_models import (
ENDPOINT_TEST_MODEL,
Expand Down Expand Up @@ -751,6 +752,7 @@ def test_068_import_remote_noninteractive_requires_recipe_file(self):
# Load Tests
# =============================================================================

@skip_on_macos_ci
def test_070_load_with_ctx_size(self):
"""Test load command with --ctx-size option."""
result = run_cli_command(
Expand All @@ -759,6 +761,7 @@ def test_070_load_with_ctx_size(self):
)
print(f"Load with ctx-size exit code: {result.returncode}")

@skip_on_macos_ci
def test_071_load_with_save_options(self):
"""Test load command with --save-options flag."""
result = run_cli_command(
Expand All @@ -771,6 +774,7 @@ def test_071_load_with_save_options(self):
# Run Tests
# =============================================================================

@skip_on_macos_ci
def test_100_run_with_model(self):
"""Test run command with explicit model."""
with tempfile.TemporaryDirectory(prefix="lemonade-open-stub-") as temp_dir:
Expand All @@ -782,6 +786,7 @@ def test_100_run_with_model(self):
)
self.assertEqual(result.returncode, 0)

@skip_on_macos_ci
def test_101_run_with_combined_options(self):
"""Test run command with --ctx-size and --save-options together."""
with tempfile.TemporaryDirectory(prefix="lemonade-open-stub-") as temp_dir:
Expand All @@ -793,6 +798,7 @@ def test_101_run_with_combined_options(self):
)
self.assertEqual(result.returncode, 0)

@skip_on_macos_ci
def test_102_run_with_host_port(self):
"""Test run command using global --host/--port options."""
with tempfile.TemporaryDirectory(prefix="lemonade-open-stub-") as temp_dir:
Expand Down
26 changes: 20 additions & 6 deletions test/server_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import requests
from openai import NotFoundError

from utils.capabilities import skip_on_macos_ci
from utils.server_base import (
ServerTestBase,
run_server_tests,
Expand Down Expand Up @@ -330,6 +331,7 @@ def test_008_pull_model_streaming(self):

print(f"[OK] Pull (streaming): received events: {set(events_received)}")

@skip_on_macos_ci
def test_009_load_model_basic(self):
"""Test loading a model into memory."""
# Model is already pulled (setUpClass or previous pull tests)
Expand Down Expand Up @@ -358,6 +360,7 @@ def test_009_load_model_basic(self):

print(f"[OK] Loaded model: {ENDPOINT_TEST_MODEL}")

@skip_on_macos_ci
def test_010_load_model_with_options(self):
"""Test loading a model with custom options (ctx_size, llamacpp_backend, llamacpp_args)."""
# Load with custom options (reloads only if options differ from current)
Expand Down Expand Up @@ -396,6 +399,7 @@ def test_010_load_model_with_options(self):

print(f"[OK] Loaded model with ctx_size={custom_ctx_size}")

@skip_on_macos_ci
def test_011_load_model_save_options(self):
"""Test save_options=true saves settings to recipe_options.json."""
custom_ctx_size = 4096
Expand Down Expand Up @@ -425,6 +429,7 @@ def test_011_load_model_save_options(self):
)
print(f"[OK] Verified saved ctx_size={custom_ctx_size} via model info")

@skip_on_macos_ci
def test_012_load_uses_saved_options(self):
"""Test that load reads previously saved options from recipe_options.json."""
# First, save options with a specific ctx_size
Expand Down Expand Up @@ -472,6 +477,7 @@ def test_012_load_uses_saved_options(self):
print(f"[OK] Load used saved ctx_size={custom_ctx_size}")
break

@skip_on_macos_ci
def test_012a_load_idempotent_same_options(self):
"""Test that /load is idempotent: loading an already-loaded model with
the same options is a no-op (no eviction or reload).
Expand Down Expand Up @@ -506,6 +512,7 @@ def test_012a_load_idempotent_same_options(self):
)
print(f"[OK] Idempotent /load with same options was a no-op ({elapsed:.3f}s)")

@skip_on_macos_ci
def test_012b_load_reloads_on_option_change(self):
"""Test that /load evicts and reloads when options differ."""
# Ensure model is loaded with default options (no ctx_size override)
Expand All @@ -531,7 +538,8 @@ def test_012b_load_reloads_on_option_change(self):
opts_before = m.get("recipe_options", {})
break
self.assertNotEqual(
opts_before.get("ctx_size"), 2048,
opts_before.get("ctx_size"),
2048,
"Precondition: model should not already have ctx_size=2048",
)

Expand Down Expand Up @@ -559,8 +567,11 @@ def test_012b_load_reloads_on_option_change(self):
"Option-change /load should reload with new options",
)

print(f"[OK] /load with different options triggered reload (ctx_size={custom_ctx})")
print(
f"[OK] /load with different options triggered reload (ctx_size={custom_ctx})"
)

@skip_on_macos_ci
def test_012c_load_noop_when_already_loaded_by_inference(self):
"""Regression test for #1603: /load after an inference-triggered
auto-load should no-op, not evict and reload the model.
Expand Down Expand Up @@ -612,11 +623,10 @@ def test_012c_load_noop_when_already_loaded_by_inference(self):
)

# Model should still be loaded
health = requests.get(
f"{self.base_url}/health", timeout=TIMEOUT_DEFAULT
).json()
health = requests.get(f"{self.base_url}/health", timeout=TIMEOUT_DEFAULT).json()
loaded = [
m for m in health.get("all_models_loaded", [])
m
for m in health.get("all_models_loaded", [])
if m["model_name"] == ENDPOINT_TEST_MODEL
]
self.assertEqual(
Expand All @@ -625,6 +635,7 @@ def test_012c_load_noop_when_already_loaded_by_inference(self):

print(f"[OK] /load after auto-load was a no-op ({elapsed:.3f}s)")

@skip_on_macos_ci
def test_013_unload_specific_model(self):
"""Test unloading a specific model by name."""
# First load a model
Expand Down Expand Up @@ -686,6 +697,7 @@ def test_014_unload_nonexistent_model(self):

print("[OK] 404 returned for unloading non-existent model")

@skip_on_macos_ci
def test_015_unload_all_models(self):
"""Test unloading all models without specifying model_name."""
# First load a model
Expand Down Expand Up @@ -925,6 +937,7 @@ def test_020_web_app_root(self):
)
print(f"[OK] GET / returned HTML ({len(body)} bytes)")

@skip_on_macos_ci
def test_021_stats_endpoint(self):
"""Test the /stats endpoint returns performance metrics."""
# First, make an inference request to populate stats
Expand Down Expand Up @@ -1146,6 +1159,7 @@ def test_021a_pull_sdcpp_import_preserves_merged_recipe_options(self):
except Exception:
pass

@skip_on_macos_ci
def test_021b_appear_builtin_aliases_user_model(self):
"""User models labeled appear-builtin should expose a bare public ID."""
canonical_name = f"user.AppearBuiltin-{uuid.uuid4().hex[:8]}"
Expand Down
32 changes: 26 additions & 6 deletions test/server_streaming_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import requests

from utils.capabilities import skip_on_macos_ci
from utils.server_base import ServerTestBase, run_server_tests
from utils.test_models import (
PORT,
Expand All @@ -23,7 +24,9 @@
class StreamingErrorTests(ServerTestBase):
"""Tests that streaming responses terminate cleanly on all code paths."""

def _post_streaming(self, model_name, messages=None, tools=None, timeout=TIMEOUT_DEFAULT):
def _post_streaming(
self, model_name, messages=None, tools=None, timeout=TIMEOUT_DEFAULT
):
"""Send a streaming chat/completions request and return the raw response."""
if messages is None:
messages = [{"role": "user", "content": "Say hello."}]
Expand Down Expand Up @@ -65,8 +68,11 @@ def test_002_invalid_model_name_stream_terminates_cleanly(self):
"""Streaming request with a malformed model name terminates cleanly."""
response = self._post_streaming("org/repo:invalid-tag-does-not-exist")
lines = self._consume_stream(response)
print(f"[OK] Malformed model name: stream closed cleanly ({len(lines)} line(s))")
print(
f"[OK] Malformed model name: stream closed cleanly ({len(lines)} line(s))"
)

@skip_on_macos_ci
def test_003_streaming_after_unload_terminates_cleanly(self):
"""Streaming request after unloading all models terminates cleanly."""
unload_resp = requests.post(
Expand All @@ -80,6 +86,7 @@ def test_003_streaming_after_unload_terminates_cleanly(self):
lines = self._consume_stream(response)
print(f"[OK] Post-unload: stream closed cleanly ({len(lines)} line(s))")

@skip_on_macos_ci
def test_004_streaming_context_overflow_terminates_cleanly(self):
"""Context-overflow prompt causes backend non-200; stream must still terminate.

Expand All @@ -105,6 +112,7 @@ def test_004_streaming_context_overflow_terminates_cleanly(self):
lines = self._consume_stream(response)
print(f"[OK] Context overflow: stream closed cleanly ({len(lines)} line(s))")

@skip_on_macos_ci
def test_005_streaming_with_many_tools_terminates_cleanly(self):
"""Streaming with 15 tools terminates cleanly (original bug report scenario)."""
many_tools = [
Expand All @@ -116,9 +124,18 @@ def test_005_streaming_with_many_tools_terminates_cleanly(self):
"parameters": {
"type": "object",
"properties": {
"input": {"type": "string", "description": f"Input for tool {i}."},
"count": {"type": "integer", "description": f"Repeat count for tool {i}."},
"dry_run": {"type": "boolean", "description": "Simulate without executing."},
"input": {
"type": "string",
"description": f"Input for tool {i}.",
},
"count": {
"type": "integer",
"description": f"Repeat count for tool {i}.",
},
"dry_run": {
"type": "boolean",
"description": "Simulate without executing.",
},
},
"required": ["input"],
},
Expand All @@ -136,6 +153,7 @@ def test_005_streaming_with_many_tools_terminates_cleanly(self):
lines = self._consume_stream(response)
print(f"[OK] 15 tools: stream closed cleanly ({len(lines)} line(s))")

@skip_on_macos_ci
def test_006_successful_stream_includes_done_marker(self):
"""Successful streaming response includes [DONE] and terminates cleanly."""
load_resp = requests.post(
Expand All @@ -156,7 +174,9 @@ def test_006_successful_stream_includes_done_marker(self):
any("[DONE]" in line for line in lines),
f"[DONE] marker not found in stream. Lines: {lines[:10]}",
)
print(f"[OK] Happy path: [DONE] received, stream closed cleanly ({len(lines)} line(s))")
print(
f"[OK] Happy path: [DONE] received, stream closed cleanly ({len(lines)} line(s))"
)


if __name__ == "__main__":
Expand Down
Loading
Loading