diff --git a/bench/.gitignore b/bench/.gitignore index e33609d25..127c3e692 100644 --- a/bench/.gitignore +++ b/bench/.gitignore @@ -1 +1,3 @@ *.png +!results/single_turn.png +!results/agentic.png diff --git a/bench/agentic_benchmark.sh b/bench/agentic_benchmark.sh index 39cf974e7..65292bbbe 100755 --- a/bench/agentic_benchmark.sh +++ b/bench/agentic_benchmark.sh @@ -1,51 +1,52 @@ #!/usr/bin/env bash -# Agentic benchmark: run ONE harbor task at group sizes (-r) across env-server modes and -# write bench/agentic_benchmark.json (per-rollout durations + e2e wall clock), which -# bench/agentic_aggregate.py summarizes. Each rollout is its own sandbox (a coding agent + -# the harbor verifier); with no group reward the rollouts are independent, so the worker -# pool round-robins them across workers — this stresses concurrent agentic execution + -# scoring (where the single-loop server is most likely to stall). +# Agentic runtime benchmark: run ONE harbor task at group sizes (-r) across container +# runtimes (docker / prime), writing ONE results file per matrix cell to +# bench/results/agentic/{runtime}-r{rollouts}.json — so re-running a subset refreshes only +# those cells. Each rollout is its own container/sandbox (a coding agent + the harbor +# verifier); rollouts are independent, so this stresses concurrent agentic execution + scoring. # # bench/agentic_benchmark.sh -# ROLLOUTS="8 16" WORKERS="0 4" TASK=fix-git bench/agentic_benchmark.sh +# RUNTIMES="docker" MAX_CONCURRENT=64 TASK=fix-git bench/agentic_benchmark.sh # -# Compares WORKERS modes: 0 = single in-process server, N = an N-worker pool. Needs the -# `harbor` CLI (`uv tool install harbor`) and the `terminal-bench-2-v1` example taskset -# (an editable dep), plus a container runtime (prime default; PRIME_API_KEY in ~/.env). +# Needs the `harbor` CLI (`uv tool install harbor`) and the `terminal-bench-2-v1` example +# taskset (an editable dep); docker needs the daemon, prime needs PRIME_API_KEY in ~/.env. 
+# Concurrency defaults to unbounded; cap it with MAX_CONCURRENT. set -uo pipefail TASKSET="${TASKSET:-terminal-bench-2-v1}" TASK="${TASK:-fix-git}" -RUNTIME="${RUNTIME:-prime}" -ROLLOUTS="${ROLLOUTS:-32 64 128}" -WORKERS="${WORKERS:-0 4}" +RUNTIMES="${RUNTIMES:-docker prime}" +ROLLOUTS="${ROLLOUTS:-64 512}" +MAX_CONCURRENT="${MAX_CONCURRENT:-None}" # None = unbounded rollouts in flight MODEL="${MODEL:-deepseek/deepseek-v4-flash}" -MAX_TURNS="${MAX_TURNS:-30}" +MAX_TURNS="${MAX_TURNS:-32}" ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" cd "$ROOT" -set -a; . "$HOME/.env" 2>/dev/null || true; set +a +# Creds for the model endpoint (+ prime runtime); skip the FIFO read if already in the env. +[ -n "${PRIME_API_KEY:-}" ] || { set -a; . "$HOME/.env" 2>/dev/null || true; set +a; } -OUT="/tmp/vbench/agentic" -rm -rf "$OUT"; mkdir -p "$OUT"; : > "$OUT/e2e.txt" -for w in $WORKERS; do +WORK="/tmp/vbench/agentic" # scratch (raw eval outputs), wiped each run +RESULTS="$ROOT/bench/results/agentic" # committed per-cell results, NOT wiped +rm -rf "$WORK"; mkdir -p "$WORK" "$RESULTS"; : > "$WORK/e2e.txt" +for rt in $RUNTIMES; do for r in $ROLLOUTS; do - label="w$w-r$r" - echo "== $label (task=$TASK runtime=$RUNTIME max_turns=$MAX_TURNS) ==" + label="$rt-r$r" + echo "== $label (task=$TASK max_concurrent=$MAX_CONCURRENT max_turns=$MAX_TURNS) ==" start=$(date +%s) uv run eval "$TASKSET" --taskset.tasks "[\"$TASK\"]" \ - --harness.id default --harness.enable_bash true --harness.runtime.type "$RUNTIME" \ - --num_tasks 1 --num_rollouts "$r" --num_workers "$w" \ - --max_concurrent 512 --retry.attempts 1 --max_turns "$MAX_TURNS" \ - --rich false --output_dir "$OUT/$label" \ - > "$OUT/$label.stdout" 2> "$OUT/$label.log" + --harness.id default --harness.enable_bash true --harness.runtime.type "$rt" \ + --num_tasks 1 --num_rollouts "$r" \ + --max_concurrent "$MAX_CONCURRENT" --max_turns "$MAX_TURNS" \ + --retries.rollout.max_retries 0 --retries.model.max_retries 0 
--retries.runtime.max_retries 0 \ + --rich false --output_dir "$WORK/$label" \ + > "$WORK/$label.stdout" 2> "$WORK/$label.log" rc=$? - echo "$w $r $(( $(date +%s) - start ))" >> "$OUT/e2e.txt" - echo "rc=$rc e2e=$(tail -1 "$OUT/e2e.txt" | awk '{print $3}')s" + echo "$rt $r $(( $(date +%s) - start ))" >> "$WORK/e2e.txt" + echo "rc=$rc e2e=$(tail -1 "$WORK/e2e.txt" | awk '{print $3}')s" done done -# Aggregate into agentic_benchmark.json: per-(workers, rollouts) e2e + the per-rollout -# generation-duration list (p10/p50/p90), reward, and error count. -uv run python bench/bench_aggregate.py "$OUT" "$TASK ($RUNTIME, max_turns=$MAX_TURNS)" > "$OUT/agentic_benchmark.json" -echo "wrote $OUT/agentic_benchmark.json" +# One JSON per cell into the committed results dir (a subset run won't clobber other cells). +uv run python bench/bench_aggregate.py "$WORK" "$RESULTS" +echo "wrote per-cell results to $RESULTS" diff --git a/bench/bench_aggregate.py b/bench/bench_aggregate.py index 854be00a7..a529f4001 100644 --- a/bench/bench_aggregate.py +++ b/bench/bench_aggregate.py @@ -1,44 +1,66 @@ -"""Aggregate a worker-pool benchmark run (single-turn or agentic) into JSON. +"""Aggregate a benchmark run into per-cell JSON — one file per matrix element. - python bench/bench_aggregate.py