PrimeIntellect-ai · mikasenghaas · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/bench/.gitignore b/bench/.gitignore
@@ -1 +1,3 @@
 *.png
+!results/single_turn.png
+!results/agentic.png
diff --git a/bench/agentic_benchmark.sh b/bench/agentic_benchmark.sh
@@ -1,51 +1,52 @@
 #!/usr/bin/env bash
-# Agentic benchmark: run ONE harbor task at group sizes (-r) across env-server modes and
-# write bench/agentic_benchmark.json (per-rollout durations + e2e wall clock), which
-# bench/agentic_aggregate.py summarizes. Each rollout is its own sandbox (a coding agent +
-# the harbor verifier); with no group reward the rollouts are independent, so the worker
-# pool round-robins them across workers — this stresses concurrent agentic execution +
-# scoring (where the single-loop server is most likely to stall).
+# Agentic runtime benchmark: run ONE harbor task at group sizes (-r) across container
+# runtimes (docker / prime), writing ONE results file per matrix cell to
+# bench/results/agentic/<runtime>-r<rollouts>.json — so re-running a subset refreshes only
+# those cells. Each rollout is its own container/sandbox (a coding agent + the harbor
+# verifier); rollouts are independent, so this stresses concurrent agentic execution + scoring.
 #
 #   bench/agentic_benchmark.sh
-#   ROLLOUTS="8 16" WORKERS="0 4" TASK=fix-git bench/agentic_benchmark.sh
+#   RUNTIMES="docker" MAX_CONCURRENT=64 TASK=fix-git bench/agentic_benchmark.sh
 #
-# Compares WORKERS modes: 0 = single in-process server, N = an N-worker pool. Needs the
-# `harbor` CLI (`uv tool install harbor`) and the `terminal-bench-2-v1` example taskset
-# (an editable dep), plus a container runtime (prime default; PRIME_API_KEY in ~/.env).
+# Needs the `harbor` CLI (`uv tool install harbor`) and the `terminal-bench-2-v1` example
+# taskset (an editable dep); docker needs the daemon, prime needs PRIME_API_KEY in ~/.env.
+# Concurrency defaults to unbounded; cap it with MAX_CONCURRENT.
 set -uo pipefail
 
 TASKSET="${TASKSET:-terminal-bench-2-v1}"
 TASK="${TASK:-fix-git}"
-RUNTIME="${RUNTIME:-prime}"
-ROLLOUTS="${ROLLOUTS:-32 64 128}"
-WORKERS="${WORKERS:-0 4}"
+RUNTIMES="${RUNTIMES:-docker prime}"      # space-separated; outer loop of the matrix
+ROLLOUTS="${ROLLOUTS:-64 512}"            # space-separated group sizes; inner loop of the matrix
+MAX_CONCURRENT="${MAX_CONCURRENT:-None}"  # None = unbounded rollouts in flight
 MODEL="${MODEL:-deepseek/deepseek-v4-flash}"
-MAX_TURNS="${MAX_TURNS:-30}"
+MAX_TURNS="${MAX_TURNS:-32}"              # forwarded to eval as --max_turns
 
 ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 cd "$ROOT"
-set -a; . "$HOME/.env" 2>/dev/null || true; set +a
+# Creds for the model endpoint (+ prime runtime); skip reading ~/.env (a FIFO) when PRIME_API_KEY is already set.
+[ -n "${PRIME_API_KEY:-}" ] || { set -a; . "$HOME/.env" 2>/dev/null || true; set +a; }
 
-OUT="/tmp/vbench/agentic"
-rm -rf "$OUT"; mkdir -p "$OUT"; : > "$OUT/e2e.txt"
-for w in $WORKERS; do
+WORK="/tmp/vbench/agentic"            # scratch (raw eval outputs), wiped each run
+RESULTS="$ROOT/bench/results/agentic"   # committed per-cell results, NOT wiped
+rm -rf "$WORK"; mkdir -p "$WORK" "$RESULTS"; : > "$WORK/e2e.txt"
+for rt in $RUNTIMES; do
   for r in $ROLLOUTS; do
-    label="w$w-r$r"
-    echo "== $label (task=$TASK runtime=$RUNTIME max_turns=$MAX_TURNS) =="
+    label="$rt-r$r"
+    echo "== $label (task=$TASK max_concurrent=$MAX_CONCURRENT max_turns=$MAX_TURNS) =="
     start=$(date +%s)
     uv run eval "$TASKSET" --taskset.tasks "[\"$TASK\"]" \
-      --harness.id default --harness.enable_bash true --harness.runtime.type "$RUNTIME" \
-      --num_tasks 1 --num_rollouts "$r" --num_workers "$w" \
-      --max_concurrent 512 --retry.attempts 1 --max_turns "$MAX_TURNS" \
-      --rich false --output_dir "$OUT/$label" \
-      > "$OUT/$label.stdout" 2> "$OUT/$label.log"
+      --harness.id default --harness.enable_bash true --harness.runtime.type "$rt" \
+      --num_tasks 1 --num_rollouts "$r" \
+      --max_concurrent "$MAX_CONCURRENT" --max_turns "$MAX_TURNS" \
+      --retries.rollout.max_retries 0 --retries.model.max_retries 0 --retries.runtime.max_retries 0 \
+      -m "$MODEL" --rich false --output_dir "$WORK/$label" \
+      > "$WORK/$label.stdout" 2> "$WORK/$label.log"
     rc=$?
-    echo "$w $r $(( $(date +%s) - start ))" >> "$OUT/e2e.txt"
-    echo "rc=$rc e2e=$(tail -1 "$OUT/e2e.txt" | awk '{print $3}')s"
+    echo "$rt $r $(( $(date +%s) - start ))" >> "$WORK/e2e.txt"
+    echo "rc=$rc e2e=$(tail -1 "$WORK/e2e.txt" | awk '{print $3}')s"
   done
 done
 
-# Aggregate into agentic_benchmark.json: per-(workers, rollouts) e2e + the per-rollout
-# generation-duration list (p10/p50/p90), reward, and error count.
-uv run python bench/bench_aggregate.py "$OUT" "$TASK ($RUNTIME, max_turns=$MAX_TURNS)" > "$OUT/agentic_benchmark.json"
-echo "wrote $OUT/agentic_benchmark.json"
+# One JSON per cell into the committed results dir (a subset run won't clobber other cells).
+uv run python bench/bench_aggregate.py "$WORK" "$RESULTS"
+echo "wrote per-cell results to $RESULTS"
diff --git a/bench/bench_aggregate.py b/bench/bench_aggregate.py
@@ -1,44 +1,66 @@
-"""Aggregate a worker-pool benchmark run (single-turn or agentic) into JSON.
+"""Aggregate a benchmark run into per-cell JSON — one file per matrix element.
 
-    python bench/bench_aggregate.py <out_dir> <label>
+    python bench/bench_aggregate.py <work_dir> <results_dir>
 
-`out_dir` holds one `w<workers>-r<rollouts>/` eval output dir per (workers, group size)
-plus `e2e.txt` (`<workers> <rollouts> <e2e_seconds>` lines). For each run we record the
-e2e wall clock, the full per-rollout `generation.duration` list (so p10/p50/p90 — e2e is
-straggler-gated, so the distribution is the honest comparator), reward, and error count.
-`workers=0` is the single in-process server; `>0` the worker pool. `label` is free-text
-run metadata (e.g. the taskset + runtime).
+`<work_dir>` holds one `<runtime>-r<rollouts>/` eval output dir per cell run this time, plus
+`e2e.txt` (`<runtime> <rollouts> <e2e_seconds>` lines). For each cell we write
+`<results_dir>/<runtime>-r<rollouts>.json` with the e2e wall clock, the resolved
+`max_concurrent`, reward / error count, and the sorted per-rollout duration lists for each
+stage — `setup` (provisioning), `generation`, `scoring`, and `total` (the whole rollout,
+latest stage `end` - `start`, so it also captures any between-stage gaps). Sorted lists let
+`plot.py` read off p10/p50/p90 — e2e is straggler-gated, so the distribution is the honest
+comparator. One file per cell means re-running a subset (e.g. just docker) refreshes only
+those cells and never clobbers the rest. `bench/plot.py` reads the whole results dir.
 """
 
 import glob
 import json
 import os
 import sys
+import tomllib
 
-out, label = sys.argv[1], sys.argv[2]
+work, results = sys.argv[1], sys.argv[2]  # <work_dir> (raw eval outputs), <results_dir> (per-cell JSON)
+os.makedirs(results, exist_ok=True)
 
-e2e: dict[tuple[int, int], int] = {}
-for line in open(os.path.join(out, "e2e.txt")):
-    w, r, secs = line.split()
-    e2e[(int(w), int(r))] = int(secs)
+e2e: dict[tuple[str, int], int] = {}  # (runtime, rollouts) -> e2e wall-clock seconds
+for line in open(os.path.join(work, "e2e.txt")):
+    rt, r, secs = line.split()  # "<runtime> <rollouts> <e2e_seconds>"
+    e2e[(rt, int(r))] = int(secs)
 
-runs = []
-for d in sorted(p for p in glob.glob(f"{out}/*") if os.path.isdir(p)):
-    base = os.path.basename(d)  # w<workers>-r<rollouts>
-    workers = int(base.split("-")[0][1:])
-    rollouts = int(base.split("-")[1][1:])
+
+def stage(timing: dict, name: str) -> float:
+    return round(timing.get(name, {}).get("duration", 0.0), 3)  # missing stage or duration -> 0.0
+
+
+def total(timing: dict) -> float:
+    # Whole-rollout wall time: timing["start"] -> latest stage "end" (captures between-stage gaps); 0.0 if no stage has an "end".
+    ends = [s["end"] for s in timing.values() if isinstance(s, dict) and "end" in s]  # non-dict entries are skipped
+    return round(max(ends) - timing["start"], 3) if ends else 0.0  # KeyError if ends exist but "start" is absent
+
+
+for d in sorted(p for p in glob.glob(f"{work}/*") if os.path.isdir(p)):  # every sub-directory of <work_dir> is one cell
+    base = os.path.basename(d)  # <runtime>-r<rollouts>
+    runtime, _, rollouts = base.rpartition("-r")  # split on the LAST "-r", so a runtime name may itself contain "-r"
+    rollouts = int(rollouts)
     rows = [json.loads(line) for line in open(f"{d}/results.jsonl")]
-    gens = sorted(r["timing"]["generation"]["duration"] for r in rows)
     rewards = [r["reward"] for r in rows if r.get("reward") is not None]
-    runs.append(
-        {
-            "workers": workers,
-            "rollouts": rollouts,
-            "e2e_s": e2e.get((workers, rollouts)),
-            "reward": round(sum(rewards) / len(rewards), 3) if rewards else None,
-            "errors": sum(1 for r in rows if r.get("errors")),
-            "gen_durations": [round(g, 3) for g in gens],
-        }
-    )
-
-print(json.dumps({"label": label, "runs": runs}, indent=2))
+    cfg = {}  # stays empty when the cell dir has no config.toml
+    if os.path.exists(f"{d}/config.toml"):
+        with open(f"{d}/config.toml", "rb") as f:
+            cfg = tomllib.load(f)
+    cell = {
+        "runtime": runtime,
+        "rollouts": rollouts,
+        "max_concurrent": cfg.get("max_concurrent"),  # absent in the dump => unbounded
+        "e2e_s": e2e.get((runtime, rollouts)),  # None when e2e.txt has no line for this cell
+        "reward": round(sum(rewards) / len(rewards), 3) if rewards else None,  # mean over rollouts with a reward
+        "errors": sum(1 for r in rows if r.get("errors")),  # rollouts with a truthy "errors" field
+        "setup_durations": sorted(stage(r["timing"], "setup") for r in rows),
+        "gen_durations": sorted(stage(r["timing"], "generation") for r in rows),
+        "scoring_durations": sorted(stage(r["timing"], "scoring") for r in rows),
+        "total_durations": sorted(total(r["timing"]) for r in rows),
+    }
+    out = os.path.join(results, f"{base}.json")  # one file per cell, overwritten on re-run
+    with open(out, "w") as f:
+        json.dump(cell, f, indent=2)
+    print(f"wrote {out}")
diff --git a/bench/benchmark.sh b/bench/benchmark.sh
@@ -1,47 +1,53 @@
 #!/usr/bin/env bash
-# Single-turn pool benchmark: run gsm8k-v1 at group sizes (-r, rollouts of ONE task) across
-# env-server modes (in-process vs N-worker pool) and write per-rollout durations + e2e, which
-# bench/bench_aggregate.py summarizes. The single-turn counterpart to agentic_benchmark.sh:
-# per-rollout CPU is light (no agent loop, no verifier), so this is where the pool's fixed
-# per-worker overhead is most visible against its event-loop relief.
+# Single-turn runtime benchmark: run gsm8k-v1 at group sizes (-r) across runtimes
+# (subprocess / docker / prime), writing ONE results file per matrix cell to
+# bench/results/single_turn/<runtime>-r<rollouts>.json — so re-running a subset (e.g. just
+# docker) refreshes only those cells and never clobbers the rest. Single-turn per-rollout
+# CPU is light, so this isolates each runtime's provisioning (setup) + round-trip overhead.
 #
 #   bench/benchmark.sh
-#   ROLLOUTS="32 64 128" WORKERS="0 4" RUNTIME=subprocess MAX_TOKENS=1024 bench/benchmark.sh
+#   RUNTIMES="docker" MAX_CONCURRENT=128 bench/benchmark.sh   # add/refresh just docker cells
 #
-# Compares WORKERS modes (0 = single in-process server, N = an N-worker pool), concurrency
-# capped at 512, default model. RUNTIME defaults to subprocess (no sandbox provisioning).
+# Concurrency defaults to unbounded; cap it with MAX_CONCURRENT (e.g. for docker, whose
+# per-container cold-install is disk-heavy at scale). gsm8k generation is unbounded (EOS).
 set -uo pipefail
 
 TASKSET="${TASKSET:-gsm8k-v1}"
-RUNTIME="${RUNTIME:-subprocess}"
-ROLLOUTS="${ROLLOUTS:-32 64 128}"
-WORKERS="${WORKERS:-0 4}"
-MAX_TOKENS="${MAX_TOKENS:-1024}"
+RUNTIMES="${RUNTIMES:-subprocess docker prime}"  # space-separated; outer loop of the matrix
+ROLLOUTS="${ROLLOUTS:-64 512}"                   # space-separated group sizes; inner loop of the matrix
+MAX_TOKENS="${MAX_TOKENS:-}"            # empty = unbounded generation (gsm8k stops on EOS)
+MAX_CONCURRENT="${MAX_CONCURRENT:-None}"  # None = unbounded rollouts in flight
 MODEL="${MODEL:-deepseek/deepseek-v4-flash}"
 
 ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 cd "$ROOT"
-set -a; . "$HOME/.env" 2>/dev/null || true; set +a
+# Creds for the model endpoint (+ prime runtime); skip reading ~/.env (a FIFO) when PRIME_API_KEY is already set.
+[ -n "${PRIME_API_KEY:-}" ] || { set -a; . "$HOME/.env" 2>/dev/null || true; set +a; }
 
-OUT="/tmp/vbench/single-turn"
-rm -rf "$OUT"; mkdir -p "$OUT"; : > "$OUT/e2e.txt"
-for w in $WORKERS; do
+SAMPLING=()  # optional sampling flags; expanded below with ${arr[@]+...} so an empty array is safe under set -u on bash < 4.4
+[ -n "$MAX_TOKENS" ] && SAMPLING+=(--sampling.max_tokens "$MAX_TOKENS")
+
+WORK="/tmp/vbench/single-turn"           # scratch (raw eval outputs), wiped each run
+RESULTS="$ROOT/bench/results/single_turn"  # committed per-cell results, NOT wiped
+rm -rf "$WORK"; mkdir -p "$WORK" "$RESULTS"; : > "$WORK/e2e.txt"
+for rt in $RUNTIMES; do
   for r in $ROLLOUTS; do
-    label="w$w-r$r"
-    echo "== $label (runtime=$RUNTIME max_tokens=$MAX_TOKENS) =="
+    label="$rt-r$r"
+    echo "== $label (max_concurrent=$MAX_CONCURRENT max_tokens=${MAX_TOKENS:-unbounded}) =="
     start=$(date +%s)
     uv run eval "$TASKSET" --harness.id default --harness.enable_bash false \
-      --harness.runtime.type "$RUNTIME" --num_tasks 1 --num_rollouts "$r" --num_workers "$w" \
-      --max_concurrent 512 --retry.attempts 1 --sampling.max_tokens "$MAX_TOKENS" \
-      -m "$MODEL" --rich false --output_dir "$OUT/$label" \
-      > "$OUT/$label.stdout" 2> "$OUT/$label.log"
+      --harness.runtime.type "$rt" --num_tasks 1 --num_rollouts "$r" \
+      --max_concurrent "$MAX_CONCURRENT" \
+      --retries.rollout.max_retries 0 --retries.model.max_retries 0 --retries.runtime.max_retries 0 \
+      ${SAMPLING[@]+"${SAMPLING[@]}"} \
+      -m "$MODEL" --rich false --output_dir "$WORK/$label" \
+      > "$WORK/$label.stdout" 2> "$WORK/$label.log"
     rc=$?
-    echo "$w $r $(( $(date +%s) - start ))" >> "$OUT/e2e.txt"
-    echo "rc=$rc e2e=$(tail -1 "$OUT/e2e.txt" | awk '{print $3}')s"
+    echo "$rt $r $(( $(date +%s) - start ))" >> "$WORK/e2e.txt"
+    echo "rc=$rc e2e=$(tail -1 "$WORK/e2e.txt" | awk '{print $3}')s"
   done
 done
 
-# Aggregate into benchmark.json: per-(workers, rollouts) e2e + the per-rollout
-# generation-duration list (p10/p50/p90), reward, and error count.
-uv run python bench/bench_aggregate.py "$OUT" "$TASKSET ($RUNTIME)" > "$OUT/benchmark.json"
-echo "wrote $OUT/benchmark.json"
+# One JSON per cell into the committed results dir (a subset run won't clobber other cells).
+uv run python bench/bench_aggregate.py "$WORK" "$RESULTS"
+echo "wrote per-cell results to $RESULTS"
diff --git a/bench/plot.py b/bench/plot.py
@@ -0,0 +1,97 @@
+"""Plot a runtime benchmark (per-cell bench_aggregate.py output).
+
+    uv run --with matplotlib python bench/plot.py [results_dir] [out.png]
+
+Reads one JSON per cell from `<results_dir>` (default bench/results/single_turn). Three
+panels — setup (provisioning), generation, scoring — each with one bar group per runtime and
+one bar per rollout count (p50, with a p10..p90 whisker). The distribution (not the straggler-gated
+e2e) is the honest comparator; the panels show where each runtime spends its time and how
+that scales with concurrency. PNG is committed (the gitignore excepts it).
+"""
+
+import glob
+import json
+import os
+import sys
+
+import matplotlib
+
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt  # noqa: E402
+
+
+def pct(xs: list[float], p: float) -> float:
+    if not xs:  # absent metric (e.g. a cell not re-run for this stage) -> skip the bar
+        return float("nan")
+    xs = sorted(xs)
+    k = (len(xs) - 1) * p / 100  # fractional index of the p-th percentile (p on a 0..100 scale)
+    lo = int(k)
+    hi = min(lo + 1, len(xs) - 1)  # clamp so the upper neighbour never runs past the last element
+    return xs[lo] + (xs[hi] - xs[lo]) * (k - lo)  # linear interpolation between the two neighbours
+
+
+src = sys.argv[1] if len(sys.argv) > 1 else "bench/results/single_turn"
+out = sys.argv[2] if len(sys.argv) > 2 else src.rstrip("/") + ".png"  # default: <results_dir>.png next to the dir
+cells = [json.load(open(f)) for f in sorted(glob.glob(os.path.join(src, "*.json")))]
+if not cells:
+    sys.exit(f"no cell JSON files in {src}")
+
+runtimes = sorted({c["runtime"] for c in cells})
+rollouts = sorted({c["rollouts"] for c in cells})
+by_key = {(c["runtime"], c["rollouts"]): c for c in cells}  # (runtime, rollouts) -> cell dict
+stages = [  # (panel title, cell field holding the durations, bar colour)
+    ("setup", "setup_durations", "#d98c5f"),
+    ("generation", "gen_durations", "#5f8cd9"),
+    ("scoring", "scoring_durations", "#6fae6f"),
+]
+n = len(rollouts)
+# Bar alpha per rollout count: 0.45 for the smallest, rising linearly to 1.0 for the largest (1.0 if only one).
+shades = {
+    r: (0.45 + 0.55 * i / (n - 1) if n > 1 else 1.0) for i, r in enumerate(rollouts)
+}
+
+fig, axes = plt.subplots(1, len(stages), figsize=(5.5 * len(stages), 5.5))  # one panel per stage, side by side
+x = list(range(len(runtimes)))  # one tick per runtime
+width = 0.8 / len(rollouts)  # the bars of one runtime share 0.8 of the tick spacing
+for ax, (title, field, color) in zip(axes.flat, stages):
+    for j, r in enumerate(rollouts):
+        offs = [i + (j - (len(rollouts) - 1) / 2) * width for i in x]  # centre the bar group on tick i
+        p50 = [pct(by_key.get((rt, r), {}).get(field, []), 50) for rt in runtimes]  # NaN where the cell/field is missing
+        p10 = [pct(by_key.get((rt, r), {}).get(field, []), 10) for rt in runtimes]
+        p90 = [pct(by_key.get((rt, r), {}).get(field, []), 90) for rt in runtimes]
+        yerr = [[a - b for a, b in zip(p50, p10)], [a - b for a, b in zip(p90, p50)]]  # [below, above] the p50 bar
+        ax.bar(
+            offs,
+            p50,
+            width,
+            yerr=yerr,
+            capsize=3,
+            label=f"r={r}",
+            color=color,
+            alpha=shades[r],
+            edgecolor="black",
+            linewidth=0.4,
+        )
+        for o, v, hi in zip(offs, p50, p90):
+            if v != v:  # NaN -> no data for this cell/stage
+                continue
+            ax.annotate(
+                f"{v:.0f}" if v >= 1 else f"{v:.1f}",  # p50 label: whole seconds, or one decimal below 1 s
+                (o, hi),  # anchored at the top of the p90 whisker
+                textcoords="offset points",
+                xytext=(0, 3),
+                ha="center",
+                fontsize=7,
+                color="#333",
+            )
+    ax.set_title(f"{title}  (p50 bar, p10–p90 whisker)")
+    ax.set_ylabel("seconds / rollout")
+    ax.set_xticks(x)
+    ax.set_xticklabels(runtimes)
+    ax.margins(y=0.15)
+    ax.legend(title="rollouts", fontsize=8)
+
+fig.suptitle(f"runtime benchmark — {os.path.basename(src.rstrip('/'))}", fontsize=13)
+fig.tight_layout(rect=(0, 0, 1, 0.98))
+fig.savefig(out, dpi=120)
+print(f"wrote {out}")
diff --git a/bench/results/agentic.png b/bench/results/agentic.png