Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
92 commits
Select commit Hold shift + click to select a range
43714c2
feat: add vf-nano as submodule
mikasenghaas Jun 8, 2026
4c6b282
chore: bump deps/vf-nano to feat/env-server (EnvServer)
mikasenghaas Jun 8, 2026
37d7f10
feat: run orchestrator on a vf-nano env server (reverse-text)
mikasenghaas Jun 8, 2026
0cd15df
feat: consume vf-nano Trace natively (branches→samples, shared render…
mikasenghaas Jun 8, 2026
65e1ff1
refactor: pass typed ClientConfig/SamplingConfig to the env client (n…
mikasenghaas Jun 8, 2026
daac755
refactor: orchestrator holds a typed vf.Trace[EnvTask] (no dicts)
mikasenghaas Jun 8, 2026
a71dfff
chore: depend on vf-nano[serve]; bump submodule
mikasenghaas Jun 8, 2026
525ced0
chore: bump vf-nano (client docstring cleanup)
mikasenghaas Jun 8, 2026
869665f
chore: drop redundant forward-ref quotes in advantage.py
mikasenghaas Jun 8, 2026
ef6fe5f
refactor: rename FinishedRollout.raw -> trace
mikasenghaas Jun 8, 2026
3b8210d
refactor: simplify FinishedRollout, read straight off the typed Trace
mikasenghaas Jun 8, 2026
671faee
refactor: spawn env server on an OS-assigned port, drop the startup poll
mikasenghaas Jun 8, 2026
ee01818
refactor: use vf.task_type instead of a local resolve_task_type
mikasenghaas Jun 8, 2026
3297663
feat: restore backfill_rollout_tokens for SFT (typed Trace)
mikasenghaas Jun 8, 2026
3079e02
fix: pass task_idx when building Cancelled traces on off-policy drop
mikasenghaas Jun 8, 2026
6382ec1
refactor: consume typed Trace[WireTask]; inline synthetic error traces
mikasenghaas Jun 8, 2026
0ea7584
feat: env-server file logging; align debug config batch/group to cano…
mikasenghaas Jun 8, 2026
67cbfd5
fix: don't double the envs/ segment in the env-server log path
mikasenghaas Jun 8, 2026
a1e9d5e
chore: bump vf-nano (Error.traceback str | None)
mikasenghaas Jun 8, 2026
5bb5d2e
chore: bump vf-nano (to_wire ordering)
mikasenghaas Jun 8, 2026
17c5611
feat: launch env servers as separate processes from the rl entrypoint
mikasenghaas Jun 8, 2026
50bf8f5
refactor: env servers use fixed configurable ports, not get_free_port
mikasenghaas Jun 8, 2026
e7cbdc9
refactor: separate train/eval env-server port blocks
mikasenghaas Jun 8, 2026
e30fc29
fix: env-server logs + sidecar queue cleanup; train-only debug config
mikasenghaas Jun 8, 2026
3f15d8c
chore: bump vf-nano (BaseRequest marker, no request_type field)
mikasenghaas Jun 8, 2026
5859660
chore: env client uses client= (was client_config=); bump vf-nano
mikasenghaas Jun 8, 2026
7a9651a
chore: bump vf-nano (drop renderers dep comment)
mikasenghaas Jun 8, 2026
614ab80
chore: bump vf-nano (configs/ + cli/ split, serve/ runtime-only)
mikasenghaas Jun 8, 2026
889d342
refactor: drop FinishedRollout.to_dict; serialize the Trace to disk d…
mikasenghaas Jun 8, 2026
69e41fb
refactor: track vf-nano agent->harness rename
mikasenghaas Jun 8, 2026
e3382ea
chore: track vf-nano plugin reorg (reverse-text path + bump)
mikasenghaas Jun 8, 2026
db83f08
feat: reuse vf.EnvConfig in the orchestrator (typed taskset/harness, …
mikasenghaas Jun 8, 2026
1e1b924
refactor: inherit vf's shared plugin resolution; trim dead EnvConfig …
mikasenghaas Jun 8, 2026
bad542c
refactor: require a configured taskset per env (no reverse-text default)
mikasenghaas Jun 8, 2026
c132ad4
chore: bump deps/vf-nano (dashboard taskset.id/harness.id fix)
mikasenghaas Jun 8, 2026
d9c5590
chore: bump deps/vf-nano (sampling max_tokens fix)
mikasenghaas Jun 8, 2026
f8c27e3
fix(env-server): require a configured env; build EnvServer from the E…
mikasenghaas Jun 9, 2026
f1f24a7
feat(nano): reverse-text uses default harness with enable_bash=false
mikasenghaas Jun 9, 2026
890f3c5
feat(nano): hendrycks-sanity config on the math-env taskset
mikasenghaas Jun 9, 2026
f72d41c
chore: bump deps/vf-nano to merged main (math/aime tasksets, fixes)
mikasenghaas Jun 9, 2026
e8616d1
chore: drop deps/verifiers, bump deps/vf-nano to merged main
mikasenghaas Jun 9, 2026
81b6beb
feat(nano): hendrycks-sanity mirrors examples/hendrycks_sanity + slur…
mikasenghaas Jun 9, 2026
ef82afc
chore(nano): fold wandb + slurm into hendrycks_sanity config
mikasenghaas Jun 9, 2026
4f75608
chore(nano): fold output_dir into hendrycks_sanity config, drop slurm…
mikasenghaas Jun 9, 2026
e8729da
feat(vf-v1): run on the unified verifiers package (v0 envs + v1=nano …
mikasenghaas Jun 9, 2026
8da314f
test(v0): wire alphabet-sort (multi-turn) + bump verifiers (State scrub)
mikasenghaas Jun 9, 2026
ded51b9
test(v0): wire wordle + add configs/wordle/rl.toml (2-GPU)
mikasenghaas Jun 9, 2026
6dd7c74
chore: bump deps/verifiers (v1 hygiene: init.py/tests/docs)
mikasenghaas Jun 9, 2026
414b803
refactor(v1): track de-vendored verifiers, rename nano -> v1
mikasenghaas Jun 9, 2026
6159f85
chore(v1): bump verifiers (eval/serve, v1 deps in base, bundled plugins)
mikasenghaas Jun 9, 2026
54751e5
chore(v1): bump verifiers (Task.system_prompt); write full rollout jsonl
mikasenghaas Jun 9, 2026
fb4b73f
chore: drop semgrep from uv.lock (verifiers retired the policy group)
mikasenghaas Jun 9, 2026
ede49bb
fix: pin the env-server renderer tokenizer to the base model for LoRA…
mikasenghaas Jun 9, 2026
5ce3bf6
feat(orchestrator): request vLLM token ids for MITO training (#2745)
mikasenghaas Jun 9, 2026
1827776
chore(v1): bump verifiers submodule; restore v0 env catalog + v1 -v1 …
mikasenghaas Jun 9, 2026
ac40f8e
chore(v1): bump verifiers submodule to feat/nano-as-v1 tip (textarena…
mikasenghaas Jun 9, 2026
e1912a9
chore(v1): bump verifiers submodule to feat/nano-as-v1 tip
mikasenghaas Jun 9, 2026
48dfa21
chore(v1): bump verifiers submodule to feat/nano-as-v1 tip
mikasenghaas Jun 10, 2026
193a69d
chore(v1): add v1 alphabet-sort debug config (port of examples/alphab…
mikasenghaas Jun 10, 2026
e6062e9
fix: forward extra_env_kwargs to v0 legacy envs; drop dead trajectory…
mikasenghaas Jun 10, 2026
0cb47b3
fix: install the bundled `tasksets` package (harbor-v1, textarena-v1)
mikasenghaas Jun 10, 2026
eb2c589
feat: consume the v1 message-graph trace (#2763)
mikasenghaas Jun 10, 2026
d6522de
chore(v1): bump verifiers submodule to feat/nano-as-v1 tip
mikasenghaas Jun 10, 2026
9b9f002
chore(v1): bump verifiers submodule to feat/nano-as-v1 tip
mikasenghaas Jun 10, 2026
3729338
chore(v1): fix stale sft.toml teacher comments
mikasenghaas Jun 10, 2026
7adb38e
chore(v1): bump verifiers (v0 eval chat-completions client)
mikasenghaas Jun 10, 2026
7677997
feat(v1): scaleswe configs + taskset registration (#2765)
mikasenghaas Jun 11, 2026
85b27cf
feat(v1): multimodal training through the message graph + color-codew…
mikasenghaas Jun 11, 2026
e4800fe
Revert "feat(v1): multimodal training through the message graph + col…
mikasenghaas Jun 11, 2026
7d44121
feat: multimodal training through the v1 message graph + color-codewo…
mikasenghaas Jun 11, 2026
2b9323e
feat: thread num_workers to the env-server worker pool (#2768)
mikasenghaas Jun 11, 2026
9b74f8b
chore(v1): bump verifiers to be76cbc3 (env-server worker pool)
mikasenghaas Jun 11, 2026
ff51b6a
feat(v1): register r2e-gym-v1 taskset
mikasenghaas Jun 11, 2026
b5062a1
chore(swe): use r2e-gym for rlm_swe configs (v0 + v1)
mikasenghaas Jun 11, 2026
7c4c359
chore(swe): point rlm_swe configs at r2e-gym (content)
mikasenghaas Jun 11, 2026
17d1ab5
fix(v1): restore env-server worker logs to the env log file (#2770)
mikasenghaas Jun 11, 2026
15c1856
chore(v1): bump verifiers to 955b6cdf (dashboard token usage fallback)
mikasenghaas Jun 11, 2026
6fba300
chore(v1): bump verifiers to 8e4ad735 (clean env-server teardown)
mikasenghaas Jun 11, 2026
1f64004
chore(deps): bump verifiers (renderers floor 0.1.8.dev40)
mikasenghaas Jun 11, 2026
77c75a9
chore(v1): bump verifiers to db82b38a (reap subprocess tree on cancel)
mikasenghaas Jun 11, 2026
a0ffc31
feat: elastic env-server pool (inherit static/elastic pool config) (#…
mikasenghaas Jun 11, 2026
01e3d27
chore(v1): bump verifiers to f404e97f (elastic env-server pool)
mikasenghaas Jun 11, 2026
9ae4ee3
chore(v1): bump verifiers to 88e9bedd
mikasenghaas Jun 11, 2026
911df77
chore(v1): bump verifiers to 40a2e89f (fix trace.timing.*.duration to…
S1ro1 Jun 11, 2026
123ebfb
chore(v1): bump verifiers to 5dc084f5
mikasenghaas Jun 11, 2026
cf0c966
chore(v1): bump verifiers to 472622ba
mikasenghaas Jun 11, 2026
33879bf
chore: stop importing env modules in the orchestrator (always Rollout…
mikasenghaas Jun 11, 2026
e52471c
chore(v1): bump verifiers to 7270e69b
mikasenghaas Jun 12, 2026
aa70aba
chore(v1): bump verifiers to 66c87d5b
mikasenghaas Jun 12, 2026
0fa5eb4
chore(v1): bump verifiers to ef45f720
mikasenghaas Jun 12, 2026
74835f7
chore(v1): bump verifiers pin (alphabet-sort host user sim, #1645)
mikasenghaas Jun 12, 2026
8cba109
chore(v1): bump verifiers pin (modal creates_per_sec 5 -> 40, #1646)
mikasenghaas Jun 12, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/cpu_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,11 @@ jobs:
from prime_rl.utils.config import BaseConfig, find_package_resource, rgetattr, rsetattr
from prime_rl.utils.validation import validate_shared_ckpt_config

# `verifiers` (+ its `datasets` dep) is a declared slim dep: the v1 config types
# (EnvConfig, Task, ...) extend verifiers.v1, which is pure-pydantic and pulls no
# GPU/ML deps. We still forbid the actual heavy training deps below.
forbidden = ["torch", "transformers", "vllm", "wandb", "ring_flash_attn",
"verifiers", "prime", "datasets", "liger_kernel", "loguru"]
"prime", "liger_kernel", "loguru"]
leaked = [m for m in forbidden if m in sys.modules]
if leaked:
raise SystemExit(f"slim install leaked heavy deps into sys.modules: {leaked}")
Expand Down
7 changes: 4 additions & 3 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
[submodule "verifiers"]
path = deps/verifiers
url = git@github.com:PrimeIntellect-ai/verifiers.git
[submodule "renderers"]
path = deps/renderers
url = git@github.com:PrimeIntellect-ai/renderers.git
Expand All @@ -13,3 +10,7 @@
[submodule "pydantic-config"]
path = deps/pydantic-config
url = https://github.com/PrimeIntellect-ai/pydantic-config
[submodule "deps/verifiers"]
path = deps/verifiers
url = git@github.com:PrimeIntellect-ai/verifiers.git
branch = feat/nano-as-v1
4 changes: 4 additions & 0 deletions configs/ci/integration/reverse_text_rl_sft/start.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ training_mode = "sft"
batch_size = 128
group_size = 16

# Teacher rolls out over plain chat-completions (no tokens); the renderer backfills them.
[orchestrator.renderer]
name = "qwen3"

[orchestrator.train.sampling]
max_completion_tokens = 128

Expand Down
31 changes: 0 additions & 31 deletions configs/debug/reverse_text_v1.toml

This file was deleted.

7 changes: 1 addition & 6 deletions configs/debug/training_modes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ Minimal end-to-end configs for the three training modes (`rl` / `opd` / `sft`) a
| `opd_lora.toml` | `opd` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | trains a LoRA adapter (rank 8) |
| `sft.toml` | `sft` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | |
| `sft_lora.toml` | `sft` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | trains a LoRA adapter (rank 8) |
| `sft_external.toml` | `sft` | PI inference (`openai/gpt-5-mini`) | external OAI endpoint; no local teacher |

The student inference server is auto-launched on GPU 0 at `http://localhost:8000/v1` with `gpu_memory_utilization=0.5`. The local teacher (used by everything except `rl.toml` and `sft_external.toml`) is **not** auto-launched — start it manually on GPU 1.
The student inference server is auto-launched on GPU 0 at `http://localhost:8000/v1` with `gpu_memory_utilization=0.5`. The local teacher (used by everything except `rl.toml`) is **not** auto-launched — start it manually on GPU 1.

## Start the local teacher

Expand All @@ -38,10 +37,6 @@ uv run rl @ configs/debug/training_modes/opd_lora.toml
# SFT hard distill (needs teacher on port 8001)
uv run rl @ configs/debug/training_modes/sft.toml
uv run rl @ configs/debug/training_modes/sft_lora.toml

# SFT hard distill from openai/gpt-5-mini via PI inference
# (requires PRIME_API_KEY + PRIME_TEAM_ID in env; no local teacher needed)
uv run rl @ configs/debug/training_modes/sft_external.toml
```

See [docs/training.md](../../docs/training.md#training-modes-rl--opd--sft-via-orchestrator) for what each mode does.
4 changes: 4 additions & 0 deletions configs/debug/training_modes/sft.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ training_mode = "sft"
batch_size = 128
group_size = 4

# Teacher rolls out over plain chat-completions (no tokens); the renderer backfills them.
[orchestrator.renderer]
name = "qwen3"

[orchestrator.train.sampling]
max_completion_tokens = 128

Expand Down
55 changes: 0 additions & 55 deletions configs/debug/training_modes/sft_external.toml

This file was deleted.

4 changes: 4 additions & 0 deletions configs/debug/training_modes/sft_lora.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ training_mode = "sft"
batch_size = 128
group_size = 4

# Teacher rolls out over plain chat-completions (no tokens); the renderer backfills them.
[orchestrator.renderer]
name = "qwen3"

[orchestrator.train.sampling]
max_completion_tokens = 128

Expand Down
42 changes: 42 additions & 0 deletions configs/debug/v1/alphabet_sort.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# v1 counterpart of examples/alphabet_sort/rl.toml. Only the env block differs: it loads
# the v1 `alphabet-sort-v1` taskset (multi-turn via a colocated vf.User) in place of the
# v0 `primeintellect/alphabet-sort` env, with the harness on the subprocess runtime.

max_steps = 200
seq_len = 2048

# Empty table: checkpoint at the end of training.
[ckpt]

[model]
name = "Qwen/Qwen3-4B-Instruct-2507"

[wandb]
project = "alphabet-sort"
name = "alphabet-sort"

[trainer.model]
impl = "auto"
ac.freq = 1
lora.rank = 32
lora.alpha = 64

[trainer.optim]
lr = 1e-5

[orchestrator]
batch_size = 512
group_size = 8

[orchestrator.train.sampling]
max_completion_tokens = 768

# Scalar keys of an env entry must stay above its sub-table headers; anything written
# after a header belongs to that sub-table instead.
[[orchestrator.train.env]]
name = "alphabet-sort"

[orchestrator.train.env.taskset]
id = "alphabet-sort-v1"
min_turns = 3
max_turns = 5
power_per_turn = false

[orchestrator.train.env.harness]
id = "default"
enable_bash = false
runtime.type = "subprocess"

# Empty table: default inference config.
[inference]
52 changes: 52 additions & 0 deletions configs/debug/v1/hendrycks_sanity.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# v1 analog of examples/hendrycks_sanity/rl.toml: the same config, with only the env
# sections swapped to v1 taskset/harness syntax (math-env taskset, default harness,
# subprocess runtime). Submits to slurm by default; pass --no-slurm to run locally.

output_dir = "/beegfs/mika/hendrycks-sanity-v1-subprocess"
max_steps = 5000

[wandb]
project = "hendrycks-sanity"
name = "v1"

[deployment]
num_train_gpus = 4
num_infer_gpus = 4

[slurm]
job_name = "v1-subprocess"
partition = "cluster"

[model]
name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

[orchestrator]
batch_size = 512
group_size = 8
seq_len = 8192

# Model not in MODEL_RENDERER_MAP — opt into DefaultRenderer (apply_chat_template).
# Kept next to the other orchestrator tables so the whole orchestrator config reads
# top-down in one place.
[orchestrator.renderer]
name = "default"
reasoning_parser = "think"

# Scalar keys of an env entry must stay above its sub-table headers; anything written
# after a header belongs to that sub-table instead.
[[orchestrator.train.env]]
name = "hendrycks-math"

[orchestrator.train.env.taskset]
id = "math-env-v1"
dataset_name = "mikasenghaas/Sanity-Test-R1D-1.5B"
dataset_subset = "default"

[orchestrator.train.env.harness]
id = "default"
enable_bash = false
runtime.type = "subprocess"

[orchestrator.eval]
interval = 50

[[orchestrator.eval.env]]
name = "aime2024"
taskset = { id = "aime24-v1" }
group_size = 32

[orchestrator.eval.env.harness]
id = "default"
enable_bash = false
runtime.type = "subprocess"

[trainer.model]
seq_len = 16384

[inference.model]
max_model_len = 8192
86 changes: 86 additions & 0 deletions configs/debug/v1/r2e_gym.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# v1 counterpart of configs/rlm_swe/qwen35_4b.toml. Only the env blocks differ: they load
# the v1 `r2e-gym-v1` taskset through the rlm harness on the prime runtime in place of the
# v0 `rlm_swe` composable env. Model / trainer / inference / orchestrator knobs are the same.

output_dir = "/beegfs/mika/rlm-swe-qwen35-4b"
max_steps = 400
seq_len = 65536

[slurm]
job_name = "rlm-swe-qwen35-4b"
project_dir = "."
# Uses the same label string as the harness runtime `labels` entry further down.
# NOTE(review): presumably clears sandboxes left over from a previous run — confirm.
pre_run_command = "prime sandbox delete --label rlm-swe-qwen35-4b -y --plain || true"

[deployment]
type = "multi_node"
num_train_nodes = 1
num_infer_nodes = 1
num_infer_replicas = 2

[wandb]
project = "rlm-swe-debug"
name = "qwen35-4b"

[weight_broadcast]
type = "nccl"

[ckpt]
interval = 50
keep_last = 1
resume_step = -1

[model]
name = "Qwen/Qwen3.5-4B"

# Trainer

[trainer]

[trainer.model]
cp = 4
cp_style = "ulysses"
ac.freq = 1

[trainer.model.compile]

# Orchestrator

[orchestrator]
batch_size = 256
group_size = 8
max_inflight_rollouts = 512
max_off_policy_steps = 16

# The Qwen3.5 renderer runs with thinking enabled.
[orchestrator.renderer]
name = "qwen3.5"
enable_thinking = true

[orchestrator.train.sampling]
temperature = 1.0

# Scalar keys of an env entry must stay above its sub-table headers; anything written
# after a header belongs to that sub-table instead.
[[orchestrator.train.env]]
name = "rlm-swe-r2e"
taskset = { id = "r2e-gym-v1" }

[orchestrator.train.env.harness]
id = "rlm"

[orchestrator.train.env.harness.runtime]
type = "prime"
labels = ["rlm-swe-qwen35-4b"]

[orchestrator.prime_monitor]

# Inference

[inference]
gpu_memory_utilization = 0.85
enable_prefix_caching = true

[inference.model]
max_model_len = 65536

[inference.parallel]
dp = 8

# Qwen3.5-4B is a VL model, so the vision tower is skipped for text-only SWE.
# `language_model_only` is a vLLM MultiModalConfig arg with no prime-rl field of its own,
# hence it is passed through vllm_extra.
[inference.vllm_extra]
language_model_only = true
42 changes: 42 additions & 0 deletions configs/debug/v1/reverse_text.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Debug RL run against the v1 env server (reverse-text starter).
# The orchestrator never loads the env itself: it spawns one v1 EnvServer per env,
# dispatches rollouts by task index, and trains on the returned (renderer-tokenized)
# Traces. Settings are kept light for a quick end-to-end smoke.

max_steps = 20
seq_len = 2048

[wandb]
project = "reverse-text"
name = "v1"

[model]
name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT"

[orchestrator]
training_mode = "rl"
batch_size = 128
group_size = 16

[orchestrator.renderer]
name = "qwen3"

[orchestrator.train.sampling]
max_completion_tokens = 128

[[orchestrator.train.env]]
taskset = { id = "reverse-text-v1" }

# reverse-text is a pure-text single-turn env, so the bash tool is disabled (the model
# answers directly) and the subprocess runtime is used (no docker).
[orchestrator.train.env.harness]
id = "default"
enable_bash = false
runtime.type = "subprocess"

# No eval block, mirroring examples/reverse_text/rl.toml — this is a train-only
# smoke. Add an [orchestrator.eval] block (with an interval) to exercise eval.

[trainer.optim]
lr = 3e-6

[ckpt]

[inference]
gpu_memory_utilization = 0.5
Loading