team-wcv · team-wcv · May 7, 2026 · May 7, 2026 · May 7, 2026 · May 7, 2026
diff --git a/bench/bench_compare.py b/bench/bench_compare.py
@@ -0,0 +1,164 @@
+"""Drafter vs no-drafter A/B bench for the asymmetric cluster.
+
+For each length in --lengths, runs the same prompt twice: once with
+``use_drafter=True``, once with ``use_drafter=False``. Reports per-run
+TPS, drafter telemetry, and the speedup ratio.
+
+Sleeps for --sleep-between seconds after every run. For each length the
+no-drafter run goes first and the drafter run second; both runs are timed
+and reported (there is no separate warmup pass or --warmup flag).
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import time
+import urllib.request
+from typing import Final
+
+# Chat-completions endpoint that run_once() POSTs to (hard-coded LAN address).
+API_URL: Final[str] = "http://192.168.1.224:52415/v1/chat/completions"
+# Model identifier sent in the "model" field of every request body.
+MODEL: Final[str] = "mlx-community/gemma-4-31b-it-bf16"
+
+# The single user message sent on every run, so the drafter and no-drafter
+# requests for a given length receive identical input.
+PROMPT: Final[str] = (
+    "Write a detailed, comprehensive technical reference on distributed "
+    "speculative decoding for large language models. Cover the following "
+    "topics in depth, with examples, equations, and pseudocode where "
+    "relevant: (1) architectural foundations of speculative decoding, "
+    "(2) the role of drafter vs target models and how acceptance/rejection "
+    "is computed, (3) multi-token prediction (MTP) heads vs separate drafter "
+    "models, (4) tensor-parallel verification and KV cache rollback semantics, "
+    "(5) asymmetric placement on heterogeneous clusters, (6) wire-protocol "
+    "design for drafter/target IPC, (7) failure modes (drafter death, target "
+    "rank crashes, partitions) and recovery strategies, (8) tuning K (draft "
+    "depth) for different workloads, (9) integration with continuous batching "
+    "and paged attention, (10) practical performance results from real "
+    "deployments. Use markdown headings and detailed prose. Begin now."
+)
+
+
+def run_once(max_tokens: int, use_drafter: bool, timeout: int) -> dict[str, object]:
+    """POST one non-streaming chat completion and return its metrics.
+
+    Args:
+        max_tokens: Sent as ``max_tokens`` in the request body.
+        use_drafter: Sent as ``use_drafter`` in the request body.
+        timeout: Seconds passed as the ``urlopen`` timeout.
+
+    Returns:
+        A JSON-serialisable dict with the request parameters, wall time,
+        completion-token count, end-to-end tokens per second, the drafter
+        fields copied from the response's ``generation_stats`` object, two
+        derived ratios, and the first choice's ``finish_reason``. Values the
+        response does not supply are ``None``.
+
+    Raises:
+        Exception: Anything raised by ``urlopen`` or ``json.loads`` is
+            propagated unchanged; the caller is expected to report it.
+    """
+    body: dict[str, object] = {
+        "model": MODEL,
+        "messages": [{"role": "user", "content": PROMPT}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0,
+        "stream": False,
+        "use_drafter": use_drafter,
+    }
+    request = urllib.request.Request(
+        API_URL,
+        data=json.dumps(body).encode("utf-8"),
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    started = time.monotonic()
+    with urllib.request.urlopen(request, timeout=timeout) as resp:  # noqa: S310 - lan
+        raw = resp.read().decode("utf-8")
+    # Wall time spans the whole HTTP round trip (request sent until the body
+    # is fully read), so tps_total is end-to-end throughput.
+    wall = time.monotonic() - started
+
+    parsed = json.loads(raw)
+    # The ``or`` fallbacks also cover keys that are present but null/empty,
+    # which a plain ``.get(key, default)`` would let through.
+    usage = parsed.get("usage") or {}
+    stats = parsed.get("generation_stats") or {}
+    choices = parsed.get("choices") or [{}]
+    completion = int(usage.get("completion_tokens") or 0)
+
+    accepted = stats.get("accepted_draft_tokens")
+    proposed = stats.get("proposed_draft_tokens")
+    # A rate needs both counters; accepted == 0 is a valid 0.0 rate, while a
+    # missing accepted count yields None instead of raising.
+    acceptance_rate = (
+        round(accepted / proposed, 3) if proposed and accepted is not None else None
+    )
+    fraction_from_drafter = (
+        round(accepted / completion, 3) if accepted and completion else None
+    )
+
+    return {
+        "max_tokens": max_tokens,
+        "use_drafter": use_drafter,
+        "wall_s": round(wall, 2),
+        "completion_tokens": completion,
+        "tps_total": round(completion / wall, 2) if wall > 0 else 0.0,
+        "drafter_model_id": stats.get("drafter_model_id"),
+        "draft_mode": stats.get("draft_mode"),
+        "num_draft_tokens": stats.get("num_draft_tokens"),
+        "accepted_draft_tokens": accepted,
+        "proposed_draft_tokens": proposed,
+        "spec_decode_rounds": stats.get("spec_decode_rounds"),
+        "acceptance_rate": acceptance_rate,
+        "fraction_from_drafter": fraction_from_drafter,
+        "finish_reason": choices[0].get("finish_reason"),
+    }
+
+
+def main() -> None:
+    """Run the no-drafter / drafter pair for each length and save the results.
+
+    Prints every run's metrics and a per-length speedup row as they become
+    available, then writes ``{"summary": [...], "raw": [...]}`` to ``--out``.
+    A run that raises is recorded with an ``error`` field, and its length
+    pair is left out of the summary.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--lengths", type=int, nargs="+", default=[256, 1024, 2048])
+    parser.add_argument("--timeout", type=int, default=900)
+    parser.add_argument("--out", type=str, default="/tmp/bench_compare.json")
+    parser.add_argument(
+        "--sleep-between",
+        type=float,
+        default=2.0,
+        help="Seconds to sleep between runs to let the master settle.",
+    )
+    args = parser.parse_args()
+
+    results: list[dict[str, object]] = []
+    summary: list[dict[str, object]] = []
+    is_first_run = True
+    for length in args.lengths:
+        print(f"\n=== max_tokens={length} ===", flush=True)
+        # Keep this iteration's pair locally rather than searching ``results``
+        # by max_tokens: a length repeated in --lengths would otherwise match
+        # the first pair again and report stale numbers.
+        pair: dict[bool, dict[str, object]] = {}
+        # Run no-drafter first; the drafter run inherits a warm prompt cache
+        # via prefix-cache-hit, but max_tokens drives the bulk of the
+        # measured time so this is fine for steady-state TPS comparison.
+        for use_drafter in (False, True):
+            # Sleep only *between* runs; nothing follows the final run.
+            if not is_first_run:
+                time.sleep(args.sleep_between)
+            is_first_run = False
+            try:
+                r = run_once(length, use_drafter, args.timeout)
+            except Exception as exc:  # noqa: BLE001 - report bench failure
+                r = {
+                    "max_tokens": length,
+                    "use_drafter": use_drafter,
+                    "error": f"{type(exc).__name__}: {exc}",
+                }
+            results.append(r)
+            pair[use_drafter] = r
+            print(json.dumps(r, indent=2), flush=True)
+
+        # Speedup summary for this length pair; skipped if either run failed.
+        no_draft, draft = pair[False], pair[True]
+        if "error" in no_draft or "error" in draft:
+            continue
+        tps_no = float(no_draft.get("tps_total", 0.0) or 0)
+        tps_yes = float(draft.get("tps_total", 0.0) or 0)
+        row = {
+            "max_tokens": length,
+            "tps_no_drafter": tps_no,
+            "tps_drafter": tps_yes,
+            # None when the baseline produced no throughput to divide by.
+            "speedup_x": round(tps_yes / tps_no, 3) if tps_no > 0 else None,
+            "acceptance_rate": draft.get("acceptance_rate"),
+            "fraction_from_drafter": draft.get("fraction_from_drafter"),
+        }
+        print(f"\n>>> speedup at {length}: {json.dumps(row)}", flush=True)
+        summary.append(row)
+
+    out = {"summary": summary, "raw": results}
+    with open(args.out, "w", encoding="utf-8") as fh:
+        json.dump(out, fh, indent=2)
+    print("\n=== overall summary ===", flush=True)
+    print(json.dumps(summary, indent=2), flush=True)
+    print(f"Saved: {args.out}", flush=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bench/bench_concurrent.py b/bench/bench_concurrent.py
@@ -0,0 +1,150 @@
+"""Concurrent overlapping spec-decode bench for the asymmetric cluster.
+
+Fires N parallel chat-completions requests (each with the drafter
+enabled) at the master and measures:
+  - Per-request wall time, completion tokens, individual TPS
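+  - Aggregate cluster TPS (sum of successful requests' tokens / total wall
+    time from launch until every request thread has finished)
+  - Per-request start offset relative to launch (requests are not streamed,
+    so time-to-first-token itself is not measured)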
+
+The point: validate that EXO_MAX_CONCURRENT_REQUESTS > 1 actually
+overlaps spec-decode sessions correctly. Single-rank-target placements
+trivially share KV; multi-rank tensor-parallel placements with an
+asymmetric drafter are the interesting case here.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import threading
+import time
+import urllib.request
+from typing import Final
+
+# Chat-completions endpoint that run_one() POSTs to (hard-coded LAN address).
+API_URL: Final[str] = "http://192.168.1.224:52415/v1/chat/completions"
+# Model identifier sent in the "model" field of every request body.
+MODEL: Final[str] = "mlx-community/gemma-4-31b-it-bf16"
+
+# Pool of user prompts; main() repeats the list as needed and takes the
+# first --concurrency entries, one per request.
+PROMPTS: Final[list[str]] = [
+    "Explain the architecture of distributed speculative decoding in "
+    "one paragraph, then list six common failure modes with mitigations.",
+    "Write a 400-word technical brief on tensor-parallel KV cache "
+    "rollback semantics, including pseudocode for accept/reject.",
+    "Outline the difference between MTP heads and external drafter "
+    "models, and discuss when each is preferable for low-latency serving.",
+    "Describe how an n-gram drafter integrates with a transformer "
+    "target model, with attention to stateful processors and RNG.",
+    "Summarize the trade-offs of pipelined vs synchronous spec-decode, "
+    "including their interaction with continuous batching.",
+    "Walk through the wire protocol of a drafter-target IPC channel "
+    "designed for sub-millisecond round-trip on local sockets.",
+    "Explain how acceptance probability is computed for vanilla "
+    "speculative decoding and when greedy acceptance is sound.",
+    "Discuss the engineering trade-offs between more drafter heads "
+    "and more drafter depth for a fixed quality bar.",
+]
+
+
+def run_one(
+    idx: int,
+    prompt: str,
+    max_tokens: int,
+    timeout: int,
+    results: list[dict[str, object]],
+    started_at: float,
+) -> None:
+    """Send one drafter-enabled chat completion and record its outcome.
+
+    Intended as a thread target: the outcome is appended to ``results`` and
+    printed as one JSON line instead of being returned.
+
+    Args:
+        idx: Request index, copied into the result for later ordering.
+        prompt: User message content for this request.
+        max_tokens: Sent as ``max_tokens`` in the request body.
+        timeout: Seconds passed as the ``urlopen`` timeout.
+        results: Shared list that receives this request's result dict.
+        started_at: ``time.monotonic()`` reading taken by the launcher; used
+            to report how long after launch this request began.
+
+    Any exception raised while sending or parsing is caught and stored in
+    the result's ``error`` field, so one failed request does not stop the
+    others.
+    """
+    body = {
+        "model": MODEL,
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": max_tokens,
+        "temperature": 0.0,
+        "stream": False,
+        "use_drafter": True,
+    }
+    req = urllib.request.Request(
+        API_URL,
+        data=json.dumps(body).encode("utf-8"),
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    t0 = time.monotonic()
+    relative_start = t0 - started_at
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:  # noqa: S310 - lan
+            raw = resp.read().decode("utf-8")
+        wall = time.monotonic() - t0
+        parsed = json.loads(raw)
+        # ``or`` fallbacks cover keys that are present but null/empty, so a
+        # sparse-but-successful response is not misreported as a failure.
+        usage = parsed.get("usage") or {}
+        first_choice = (parsed.get("choices") or [{}])[0]
+        content = (first_choice.get("message") or {}).get("content") or ""
+        completion = int(usage.get("completion_tokens") or 0)
+        result: dict[str, object] = {
+            "idx": idx,
+            "relative_start_s": round(relative_start, 2),
+            "wall_s": round(wall, 2),
+            "completion_tokens": completion,
+            "tps": round(completion / wall, 2) if wall > 0 else 0.0,
+            "finish_reason": first_choice.get("finish_reason"),
+            # Short prefix of the reply, kept for eyeballing the output.
+            "first_64": content[:64],
+        }
+    except Exception as exc:  # noqa: BLE001 - report bench failure
+        wall = time.monotonic() - t0
+        result = {
+            "idx": idx,
+            "relative_start_s": round(relative_start, 2),
+            "wall_s": round(wall, 2),
+            "error": f"{type(exc).__name__}: {exc}",
+        }
+    results.append(result)
+    print(json.dumps(result), flush=True)
+
+
+def main() -> None:
+    """Launch the concurrent requests, aggregate their results, and save them.
+
+    Starts one thread per request, waits for all of them, then prints a
+    summary (totals, aggregate tokens per second, success/failure counts and
+    the per-request results ordered by index) and writes it to ``--out``.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--concurrency", type=int, default=4)
+    cli.add_argument("--max-tokens", type=int, default=512)
+    cli.add_argument("--timeout", type=int, default=600)
+    cli.add_argument("--out", type=str, default="/tmp/bench_concurrent.json")
+    opts = cli.parse_args()
+
+    n = opts.concurrency
+    # Cycle through PROMPTS so every request gets a prompt at any concurrency.
+    prompts = [PROMPTS[i % len(PROMPTS)] for i in range(n)]
+
+    results: list[dict[str, object]] = []
+    workers: list[threading.Thread] = []
+    started_at = time.monotonic()
+    for idx, prompt in enumerate(prompts):
+        worker = threading.Thread(
+            target=run_one,
+            args=(idx, prompt, opts.max_tokens, opts.timeout, results, started_at),
+        )
+        worker.start()
+        workers.append(worker)
+    for worker in workers:
+        worker.join()
+    total_wall = time.monotonic() - started_at
+
+    # Only requests without an "error" field count towards the token total.
+    completed = [entry for entry in results if "error" not in entry]
+    total_tokens = 0
+    for entry in completed:
+        total_tokens += int(entry.get("completion_tokens", 0))
+    if total_wall > 0:
+        aggregate_tps = round(total_tokens / total_wall, 2)
+    else:
+        aggregate_tps = 0.0
+
+    # Threads finish in arbitrary order; report them by request index.
+    individual = list(results)
+    individual.sort(key=lambda entry: int(entry["idx"]))
+
+    summary = {
+        "concurrency": n,
+        "max_tokens": opts.max_tokens,
+        "total_wall_s": round(total_wall, 2),
+        "total_tokens_completed": total_tokens,
+        "aggregate_tps": aggregate_tps,
+        "successful": len(completed),
+        "failed": n - len(completed),
+        "individual": individual,
+    }
+    print(json.dumps(summary, indent=2), flush=True)
+
+    with open(opts.out, "w", encoding="utf-8") as fh:
+        json.dump(summary, fh, indent=2)
+    print(f"Saved: {opts.out}", flush=True)
+
+
+if __name__ == "__main__":
+    main()