Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bench/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
*.png
!results/single_turn.png
!results/agentic.png
63 changes: 32 additions & 31 deletions bench/agentic_benchmark.sh
Original file line number Diff line number Diff line change
@@ -1,51 +1,52 @@
#!/usr/bin/env bash
# Agentic benchmark: run ONE harbor task at group sizes (-r) across env-server modes and
# write bench/agentic_benchmark.json (per-rollout durations + e2e wall clock), which
# bench/agentic_aggregate.py summarizes. Each rollout is its own sandbox (a coding agent +
# the harbor verifier); with no group reward the rollouts are independent, so the worker
# pool round-robins them across workers — this stresses concurrent agentic execution +
# scoring (where the single-loop server is most likely to stall).
# Agentic runtime benchmark: run ONE harbor task at group sizes (-r) across container
# runtimes (docker / prime), writing ONE results file per matrix cell to
# bench/results/agentic/<runtime>-r<rollouts>.json — so re-running a subset refreshes only
# those cells. Each rollout is its own container/sandbox (a coding agent + the harbor
# verifier); rollouts are independent, so this stresses concurrent agentic execution + scoring.
#
# bench/agentic_benchmark.sh
# ROLLOUTS="8 16" WORKERS="0 4" TASK=fix-git bench/agentic_benchmark.sh
# RUNTIMES="docker" MAX_CONCURRENT=64 TASK=fix-git bench/agentic_benchmark.sh
#
# Compares WORKERS modes: 0 = single in-process server, N = an N-worker pool. Needs the
# `harbor` CLI (`uv tool install harbor`) and the `terminal-bench-2-v1` example taskset
# (an editable dep), plus a container runtime (prime default; PRIME_API_KEY in ~/.env).
# Needs the `harbor` CLI (`uv tool install harbor`) and the `terminal-bench-2-v1` example
# taskset (an editable dep); docker needs the daemon, prime needs PRIME_API_KEY in ~/.env.
# Concurrency defaults to unbounded; cap it with MAX_CONCURRENT.
set -uo pipefail

TASKSET="${TASKSET:-terminal-bench-2-v1}"
TASK="${TASK:-fix-git}"
RUNTIME="${RUNTIME:-prime}"
ROLLOUTS="${ROLLOUTS:-32 64 128}"
WORKERS="${WORKERS:-0 4}"
RUNTIMES="${RUNTIMES:-docker prime}"
ROLLOUTS="${ROLLOUTS:-64 512}"
MAX_CONCURRENT="${MAX_CONCURRENT:-None}" # None = unbounded rollouts in flight
MODEL="${MODEL:-deepseek/deepseek-v4-flash}"
MAX_TURNS="${MAX_TURNS:-30}"
MAX_TURNS="${MAX_TURNS:-32}"

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"
set -a; . "$HOME/.env" 2>/dev/null || true; set +a
# Creds for the model endpoint (+ prime runtime); skip the FIFO read if already in the env.
[ -n "${PRIME_API_KEY:-}" ] || { set -a; . "$HOME/.env" 2>/dev/null || true; set +a; }

OUT="/tmp/vbench/agentic"
rm -rf "$OUT"; mkdir -p "$OUT"; : > "$OUT/e2e.txt"
for w in $WORKERS; do
WORK="/tmp/vbench/agentic" # scratch (raw eval outputs), wiped each run
RESULTS="$ROOT/bench/results/agentic" # committed per-cell results, NOT wiped
rm -rf "$WORK"; mkdir -p "$WORK" "$RESULTS"; : > "$WORK/e2e.txt"
# Benchmark matrix: one eval per (runtime, rollout-count) cell. Raw eval output for a cell
# goes to $WORK/<runtime>-r<rollouts>/ (stdout/stderr next to it), and one
# "<runtime> <rollouts> <e2e_seconds>" line per cell is appended to $WORK/e2e.txt.
# RUNTIMES and ROLLOUTS are space-separated lists, so they are word-split on purpose.
for rt in $RUNTIMES; do
  for r in $ROLLOUTS; do
    label="$rt-r$r"
    echo "== $label (task=$TASK max_concurrent=$MAX_CONCURRENT max_turns=$MAX_TURNS) =="
    start=$(date +%s)
    # -m "$MODEL": MODEL is defined (and overridable) near the top of this script but was
    # never handed to eval, so MODEL=... was silently ignored; bench/benchmark.sh passes it
    # the same way. All three retry budgets are 0.
    uv run eval "$TASKSET" --taskset.tasks "[\"$TASK\"]" \
      --harness.id default --harness.enable_bash true --harness.runtime.type "$rt" \
      --num_tasks 1 --num_rollouts "$r" \
      --max_concurrent "$MAX_CONCURRENT" --max_turns "$MAX_TURNS" \
      --retries.rollout.max_retries 0 --retries.model.max_retries 0 --retries.runtime.max_retries 0 \
      -m "$MODEL" --rich false --output_dir "$WORK/$label" \
      > "$WORK/$label.stdout" 2> "$WORK/$label.log"
    rc=$?
    # Wall clock is taken once and reused, rather than re-read from e2e.txt via tail | awk.
    elapsed=$(( $(date +%s) - start ))
    echo "$rt $r $elapsed" >> "$WORK/e2e.txt"
    # A non-zero rc is only reported: the matrix keeps going so one bad cell does not
    # discard the others.
    echo "rc=$rc e2e=${elapsed}s"
  done
done

# Aggregate into agentic_benchmark.json: per-(workers, rollouts) e2e + the per-rollout
# generation-duration list (p10/p50/p90), reward, and error count.
uv run python bench/bench_aggregate.py "$OUT" "$TASK ($RUNTIME, max_turns=$MAX_TURNS)" > "$OUT/agentic_benchmark.json"
echo "wrote $OUT/agentic_benchmark.json"
# One JSON per cell into the committed results dir (a subset run won't clobber other cells).
uv run python bench/bench_aggregate.py "$WORK" "$RESULTS"
echo "wrote per-cell results to $RESULTS"
84 changes: 53 additions & 31 deletions bench/bench_aggregate.py
Original file line number Diff line number Diff line change
@@ -1,44 +1,66 @@
"""Aggregate a worker-pool benchmark run (single-turn or agentic) into JSON.
"""Aggregate a benchmark run into per-cell JSON — one file per matrix element.

python bench/bench_aggregate.py <out_dir> <label>
python bench/bench_aggregate.py <work_dir> <results_dir>

`out_dir` holds one `w<workers>-r<rollouts>/` eval output dir per (workers, group size)
plus `e2e.txt` (`<workers> <rollouts> <e2e_seconds>` lines). For each run we record the
e2e wall clock, the full per-rollout `generation.duration` list (so p10/p50/p90 — e2e is
straggler-gated, so the distribution is the honest comparator), reward, and error count.
`workers=0` is the single in-process server; `>0` the worker pool. `label` is free-text
run metadata (e.g. the taskset + runtime).
`<work_dir>` holds one `<runtime>-r<rollouts>/` eval output dir per cell run this time, plus
`e2e.txt` (`<runtime> <rollouts> <e2e_seconds>` lines). For each cell we write
`<results_dir>/<runtime>-r<rollouts>.json` with the e2e wall clock, the resolved
`max_concurrent`, reward / error count, and the sorted per-rollout duration lists for each
stage — `setup` (provisioning), `generation`, `scoring`, and `total` (the whole rollout:
latest stage `end` minus `start`, so it also captures any between-stage gaps). Sorted lists let
`plot.py` read off p10/p50/p90 — e2e is straggler-gated, so the distribution is the honest
comparator. One file per cell means re-running a subset (e.g. just docker) refreshes only
those cells and never clobbers the rest. `bench/plot.py` reads the whole results dir.
"""

import glob
import json
import os
import sys
import tomllib

out, label = sys.argv[1], sys.argv[2]
work, results = sys.argv[1], sys.argv[2]
os.makedirs(results, exist_ok=True)

e2e: dict[tuple[int, int], int] = {}
for line in open(os.path.join(out, "e2e.txt")):
w, r, secs = line.split()
e2e[(int(w), int(r))] = int(secs)
e2e: dict[tuple[str, int], int] = {}
for line in open(os.path.join(work, "e2e.txt")):
rt, r, secs = line.split()
e2e[(rt, int(r))] = int(secs)

runs = []
for d in sorted(p for p in glob.glob(f"{out}/*") if os.path.isdir(p)):
base = os.path.basename(d) # w<workers>-r<rollouts>
workers = int(base.split("-")[0][1:])
rollouts = int(base.split("-")[1][1:])

def stage(timing: dict, name: str) -> float:
    """Return the `duration` of stage `name` from one rollout's timing dict, rounded to 3 places.

    Returns 0.0 when the stage is absent, is not a dict (e.g. recorded as null), or has a
    missing / non-numeric `duration`, so one incomplete rollout does not abort the whole
    aggregation.
    """
    entry = timing.get(name)
    duration = entry.get("duration") if isinstance(entry, dict) else None
    return round(duration, 3) if isinstance(duration, (int, float)) else 0.0


def total(timing: dict) -> float:
    """Whole-rollout wall time: `start` to the latest stage `end`, rounded to 3 places.

    Using the latest `end` across every stage dict (rather than summing durations) also
    captures any gaps between stages. Returns 0.0 when no stage has a numeric `end` or when
    `start` is missing / non-numeric, instead of raising on an incomplete rollout.
    """
    start = timing.get("start")
    ends = [
        s["end"]
        for s in timing.values()
        if isinstance(s, dict) and isinstance(s.get("end"), (int, float))
    ]
    # Compare by type, not truthiness: a start of 0.0 is a valid timestamp.
    if not ends or not isinstance(start, (int, float)):
        return 0.0
    return round(max(ends) - start, 3)


# Write one <results_dir>/<runtime>-r<rollouts>.json per cell directory found in <work_dir>.
for d in sorted(p for p in glob.glob(f"{work}/*") if os.path.isdir(p)):
    base = os.path.basename(d)  # <runtime>-r<rollouts>
    # rpartition splits on the LAST "-r", so a runtime name may itself contain "-r".
    runtime, sep, rollouts = base.rpartition("-r")
    results_path = os.path.join(d, "results.jsonl")
    if not sep or not rollouts.isdigit() or not os.path.exists(results_path):
        # A stray directory, or a cell whose eval died before writing results: skip it
        # (leaving any existing JSON for that cell untouched) rather than aborting the
        # aggregation of every remaining cell.
        print(f"skipping {d}: not a completed <runtime>-r<rollouts> cell", file=sys.stderr)
        continue
    rollouts = int(rollouts)
    with open(results_path) as f:
        # Tolerate blank lines (e.g. a trailing newline) in the JSONL.
        rows = [json.loads(line) for line in f if line.strip()]
    rewards = [r["reward"] for r in rows if r.get("reward") is not None]
    cfg = {}
    cfg_path = os.path.join(d, "config.toml")
    if os.path.exists(cfg_path):
        with open(cfg_path, "rb") as f:
            cfg = tomllib.load(f)
    cell = {
        "runtime": runtime,
        "rollouts": rollouts,
        "max_concurrent": cfg.get("max_concurrent"),  # absent in the dump => unbounded
        "e2e_s": e2e.get((runtime, rollouts)),
        "reward": round(sum(rewards) / len(rewards), 3) if rewards else None,
        "errors": sum(1 for r in rows if r.get("errors")),
        # Sorted so a reader can take percentiles straight off the lists.
        "setup_durations": sorted(stage(r["timing"], "setup") for r in rows),
        "gen_durations": sorted(stage(r["timing"], "generation") for r in rows),
        "scoring_durations": sorted(stage(r["timing"], "scoring") for r in rows),
        "total_durations": sorted(total(r["timing"]) for r in rows),
    }
    out = os.path.join(results, f"{base}.json")
    with open(out, "w") as f:
        json.dump(cell, f, indent=2)
    print(f"wrote {out}")
62 changes: 34 additions & 28 deletions bench/benchmark.sh
Original file line number Diff line number Diff line change
@@ -1,47 +1,53 @@
#!/usr/bin/env bash
# Single-turn pool benchmark: run gsm8k-v1 at group sizes (-r, rollouts of ONE task) across
# env-server modes (in-process vs N-worker pool) and write per-rollout durations + e2e, which
# bench/bench_aggregate.py summarizes. The single-turn counterpart to agentic_benchmark.sh:
# per-rollout CPU is light (no agent loop, no verifier), so this is where the pool's fixed
# per-worker overhead is most visible against its event-loop relief.
# Single-turn runtime benchmark: run gsm8k-v1 at group sizes (-r) across runtimes
# (subprocess / docker / prime), writing ONE results file per matrix cell to
# bench/results/single_turn/<runtime>-r<rollouts>.json — so re-running a subset (e.g. just
# docker) refreshes only those cells and never clobbers the rest. Single-turn per-rollout
# CPU is light, so this isolates each runtime's provisioning (setup) + round-trip overhead.
#
# bench/benchmark.sh
# ROLLOUTS="32 64 128" WORKERS="0 4" RUNTIME=subprocess MAX_TOKENS=1024 bench/benchmark.sh
# RUNTIMES="docker" MAX_CONCURRENT=128 bench/benchmark.sh # add/refresh just docker cells
#
# Compares WORKERS modes (0 = single in-process server, N = an N-worker pool), concurrency
# capped at 512, default model. RUNTIME defaults to subprocess (no sandbox provisioning).
# Concurrency defaults to unbounded; cap it with MAX_CONCURRENT (e.g. for docker, whose
# per-container cold-install is disk-heavy at scale). gsm8k generation is unbounded (EOS).
set -uo pipefail

TASKSET="${TASKSET:-gsm8k-v1}"
RUNTIME="${RUNTIME:-subprocess}"
ROLLOUTS="${ROLLOUTS:-32 64 128}"
WORKERS="${WORKERS:-0 4}"
MAX_TOKENS="${MAX_TOKENS:-1024}"
RUNTIMES="${RUNTIMES:-subprocess docker prime}"
ROLLOUTS="${ROLLOUTS:-64 512}"
MAX_TOKENS="${MAX_TOKENS:-}" # empty = unbounded generation (gsm8k stops on EOS)
MAX_CONCURRENT="${MAX_CONCURRENT:-None}" # None = unbounded rollouts in flight
MODEL="${MODEL:-deepseek/deepseek-v4-flash}"

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT"
set -a; . "$HOME/.env" 2>/dev/null || true; set +a
# Creds for the model endpoint (+ prime runtime); skip the FIFO read if already in the env.
[ -n "${PRIME_API_KEY:-}" ] || { set -a; . "$HOME/.env" 2>/dev/null || true; set +a; }

OUT="/tmp/vbench/single-turn"
rm -rf "$OUT"; mkdir -p "$OUT"; : > "$OUT/e2e.txt"
for w in $WORKERS; do
SAMPLING=()
[ -n "$MAX_TOKENS" ] && SAMPLING+=(--sampling.max_tokens "$MAX_TOKENS")

WORK="/tmp/vbench/single-turn" # scratch (raw eval outputs), wiped each run
RESULTS="$ROOT/bench/results/single_turn" # committed per-cell results, NOT wiped
rm -rf "$WORK"; mkdir -p "$WORK" "$RESULTS"; : > "$WORK/e2e.txt"
# Benchmark matrix: one eval per (runtime, rollout-count) cell. Raw eval output for a cell
# goes to $WORK/<runtime>-r<rollouts>/ (stdout/stderr next to it), and one
# "<runtime> <rollouts> <e2e_seconds>" line per cell is appended to $WORK/e2e.txt.
# RUNTIMES and ROLLOUTS are space-separated lists, so they are word-split on purpose.
for rt in $RUNTIMES; do
  for r in $ROLLOUTS; do
    label="$rt-r$r"
    echo "== $label (max_concurrent=$MAX_CONCURRENT max_tokens=${MAX_TOKENS:-unbounded}) =="
    start=$(date +%s)
    # ${SAMPLING[@]+...}: SAMPLING is empty by default (MAX_TOKENS unset), and expanding an
    # empty array as "${SAMPLING[@]}" under `set -u` is an "unbound variable" error on
    # bash < 4.4 (e.g. macOS /bin/bash 3.2). This form expands to nothing when empty and to
    # the individually quoted elements otherwise. All three retry budgets are 0.
    uv run eval "$TASKSET" --harness.id default --harness.enable_bash false \
      --harness.runtime.type "$rt" --num_tasks 1 --num_rollouts "$r" \
      --max_concurrent "$MAX_CONCURRENT" \
      --retries.rollout.max_retries 0 --retries.model.max_retries 0 --retries.runtime.max_retries 0 \
      ${SAMPLING[@]+"${SAMPLING[@]}"} \
      -m "$MODEL" --rich false --output_dir "$WORK/$label" \
      > "$WORK/$label.stdout" 2> "$WORK/$label.log"
    rc=$?
    # Wall clock is taken once and reused, rather than re-read from e2e.txt via tail | awk.
    elapsed=$(( $(date +%s) - start ))
    echo "$rt $r $elapsed" >> "$WORK/e2e.txt"
    # A non-zero rc is only reported: the matrix keeps going so one bad cell does not
    # discard the others.
    echo "rc=$rc e2e=${elapsed}s"
  done
done

# Aggregate into benchmark.json: per-(workers, rollouts) e2e + the per-rollout
# generation-duration list (p10/p50/p90), reward, and error count.
uv run python bench/bench_aggregate.py "$OUT" "$TASKSET ($RUNTIME)" > "$OUT/benchmark.json"
echo "wrote $OUT/benchmark.json"
# One JSON per cell into the committed results dir (a subset run won't clobber other cells).
uv run python bench/bench_aggregate.py "$WORK" "$RESULTS"
echo "wrote per-cell results to $RESULTS"
97 changes: 97 additions & 0 deletions bench/plot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""Plot a runtime benchmark (per-cell bench_aggregate.py output).

uv run --with matplotlib python bench/plot.py [results_dir] [out.png]

Reads one JSON per cell from `<results_dir>` (default bench/results/single_turn). Three
panels — setup (provisioning), generation, scoring — each a per-runtime grouped bar at p50
with a p10..p90 whisker, grouped by rollout count. The distribution (not the straggler-gated
e2e) is the honest comparator; the panels show where each runtime spends its time and how
that scales with concurrency. PNG is committed (the gitignore excepts it).
"""

import glob
import json
import os
import sys

import matplotlib

matplotlib.use("Agg")
import matplotlib.pyplot as plt # noqa: E402


def pct(xs: list[float], p: float) -> float:
    """Return the `p`-th percentile (0..100) of `xs` by linear interpolation between ranks.

    `xs` need not be sorted. An empty `xs` (an absent metric, e.g. a cell not re-run for
    this stage) yields NaN so the caller can skip the bar.

    Raises:
        ValueError: if `p` is outside [0, 100]. Previously p > 100 surfaced as an opaque
            IndexError and p < 0 silently extrapolated below the minimum.
    """
    if not 0 <= p <= 100:
        raise ValueError(f"percentile must be in [0, 100], got {p!r}")
    if not xs:
        return float("nan")
    ordered = sorted(xs)
    k = (len(ordered) - 1) * p / 100  # fractional rank into the sorted list
    lo = int(k)
    hi = min(lo + 1, len(ordered) - 1)  # clamp so p=100 does not index past the end
    return ordered[lo] + (ordered[hi] - ordered[lo]) * (k - lo)


# CLI: [results_dir] [out.png]; the PNG defaults to "<results_dir>.png" next to the dir.
src = sys.argv[1] if len(sys.argv) > 1 else "bench/results/single_turn"
out = sys.argv[2] if len(sys.argv) > 2 else src.rstrip("/") + ".png"

# Load every per-cell JSON; `with` closes each file instead of leaking one handle per cell.
cells = []
for path in sorted(glob.glob(os.path.join(src, "*.json"))):
    with open(path) as f:
        cells.append(json.load(f))
if not cells:
    sys.exit(f"no cell JSON files in {src}")

runtimes = sorted({c["runtime"] for c in cells})
rollouts = sorted({c["rollouts"] for c in cells})
by_key = {(c["runtime"], c["rollouts"]): c for c in cells}
# (panel title, cell field holding the per-rollout durations, bar color)
stages = [
    ("setup", "setup_durations", "#d98c5f"),
    ("generation", "gen_durations", "#5f8cd9"),
    ("scoring", "scoring_durations", "#6fae6f"),
]
n = len(rollouts)
# Lighter bar for smaller rollout counts, darkening with concurrency.
shades = {
    r: (0.45 + 0.55 * i / (n - 1) if n > 1 else 1.0) for i, r in enumerate(rollouts)
}

fig, axes = plt.subplots(1, len(stages), figsize=(5.5 * len(stages), 5.5))
x = list(range(len(runtimes)))
width = 0.8 / len(rollouts)  # the bars of one runtime group share 0.8 of its slot
for ax, (title, field, color) in zip(axes.flat, stages):
    for j, r in enumerate(rollouts):
        # Center the group of per-rollout bars on each runtime's tick.
        offs = [i + (j - (len(rollouts) - 1) / 2) * width for i in x]
        # A missing cell or field gives [] -> NaN percentiles -> no bar drawn.
        p50 = [pct(by_key.get((rt, r), {}).get(field, []), 50) for rt in runtimes]
        p10 = [pct(by_key.get((rt, r), {}).get(field, []), 10) for rt in runtimes]
        p90 = [pct(by_key.get((rt, r), {}).get(field, []), 90) for rt in runtimes]
        # Asymmetric whisker: [distance down to p10, distance up to p90].
        yerr = [[a - b for a, b in zip(p50, p10)], [a - b for a, b in zip(p90, p50)]]
        ax.bar(
            offs,
            p50,
            width,
            yerr=yerr,
            capsize=3,
            label=f"r={r}",
            color=color,
            alpha=shades[r],
            edgecolor="black",
            linewidth=0.4,
        )
        # Label each bar with its p50, placed just above the top of the whisker (p90).
        for o, v, hi in zip(offs, p50, p90):
            if v != v:  # NaN -> no data for this cell/stage
                continue
            ax.annotate(
                f"{v:.0f}" if v >= 1 else f"{v:.1f}",
                (o, hi),
                textcoords="offset points",
                xytext=(0, 3),
                ha="center",
                fontsize=7,
                color="#333",
            )
    ax.set_title(f"{title} (p50 bar, p10–p90 whisker)")
    ax.set_ylabel("seconds / rollout")
    ax.set_xticks(x)
    ax.set_xticklabels(runtimes)
    ax.margins(y=0.15)
    ax.legend(title="rollouts", fontsize=8)

fig.suptitle(f"runtime benchmark — {os.path.basename(src.rstrip('/'))}", fontsize=13)
fig.tight_layout(rect=(0, 0, 1, 0.98))
fig.savefig(out, dpi=120)
print(f"wrote {out}")
Binary file added bench/results/agentic.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Loading