Skip to content
Open
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
f35e3a8
feat: algorithm abstraction — sampling/scoring/loss presets (grpo, op…
hallerite Jun 9, 2026
5893d71
refactor: kill the teacher concept — model registry + runtime Algorit…
hallerite Jun 9, 2026
fc79ae1
refactor: unify advantage and token scorers into one advantage-strate…
hallerite Jun 9, 2026
3c3c3d1
Merge remote-tracking branch 'origin/main' into feat/algorithm-abstra…
hallerite Jun 10, 2026
54864f4
fix(trainer): keep ref_logprobs position-aligned when packing mixed bins
hallerite Jun 10, 2026
fc79651
feat(trainer): assert per-token array alignment after packing
hallerite Jun 10, 2026
9b41d1b
refactor: rename config key algorithm -> algo, loss_core -> loss_type
hallerite Jun 10, 2026
207ad13
refactor(configs): make [orchestrator.model] canonical for the policy
hallerite Jun 10, 2026
5f91d7d
feat(configs): mixed grpo+opd debug config
hallerite Jun 10, 2026
8eb5c33
docs: state the role principle — roles are algorithm-local labels ove…
hallerite Jun 10, 2026
fceac30
feat(orchestrator): inline algorithm-owned model references, drop the…
hallerite Jun 10, 2026
96beac3
refactor(configs): flatten FrozenModelConfig to ClientConfig + name
hallerite Jun 10, 2026
c3a7d6f
chore(configs): drop the POLICY_MODEL constant
hallerite Jun 10, 2026
abbcb8b
refactor(transport): LossType IntEnum for loss type scalars
hallerite Jun 10, 2026
d21c9f9
feat(orchestrator): per-token advantages from custom advantage strate…
hallerite Jun 10, 2026
51fa6b1
refactor(orchestrator): split algorithms.py into the algo/ package
hallerite Jun 10, 2026
5083441
chore(orchestrator): rename setup_frozen_pool/owned_pools to connect_…
hallerite Jun 10, 2026
06d50ed
feat(trainer): loss partition -> sum of three weighted components
hallerite Jun 10, 2026
4a743e3
refactor(configs): presets as data deltas, shorthand folding on raw i…
hallerite Jun 10, 2026
34bdd15
feat(orchestrator): named algorithm classes own assign/score; declare…
hallerite Jun 10, 2026
f8f73cb
refactor(orchestrator): algorithms take (policy_pool, renderer); toke…
hallerite Jun 10, 2026
3eba9f4
refactor: merge loss routing into the advantage; split sampling into …
hallerite Jun 11, 2026
d8b7e89
refactor(orchestrator): OPD ships no scalar advantage, like OPSD
hallerite Jun 11, 2026
fdb6b87
chore: parsimony pass over the algorithm abstraction
hallerite Jun 11, 2026
d322bcc
Merge remote-tracking branch 'origin/main' into feat/algorithm-abstra…
hallerite Jun 11, 2026
cfc1041
feat(orchestrator): MaxRL advantage strategy
hallerite Jun 11, 2026
618b605
refactor(configs): presets are atomic — select whole or assemble your…
hallerite Jun 11, 2026
ba05022
Merge branch 'feat/algorithm-abstraction' into feat/maxrl-advantage
hallerite Jun 11, 2026
64304b9
docs: module docstring matches atomic presets
hallerite Jun 11, 2026
3ff860c
Merge branch 'feat/algorithm-abstraction' into feat/maxrl-advantage
hallerite Jun 11, 2026
1d7e76c
feat(orchestrator): echo trains tool-response tokens by default
hallerite Jun 11, 2026
eeaf4f0
Merge branch 'feat/algorithm-abstraction' into feat/maxrl-advantage
hallerite Jun 11, 2026
1389227
test: shorthand-assembly fixture needs a renderer for tool-mode echo
hallerite Jun 11, 2026
b9af98c
Merge branch 'feat/algorithm-abstraction' into feat/maxrl-advantage
hallerite Jun 11, 2026
9157076
fix(orchestrator): accept attribution as dict or RenderedTokens
hallerite Jun 11, 2026
1395858
Merge branch 'feat/algorithm-abstraction' into feat/maxrl-advantage
hallerite Jun 11, 2026
8a7b8a6
refactor(configs): flatten the policy config — orchestrator.model.nam…
hallerite Jun 11, 2026
26cdbd3
Merge branch 'feat/algorithm-abstraction' into feat/maxrl-advantage
hallerite Jun 11, 2026
3126d46
fix: flat-policy leftovers — elastic client section, setup mocks, docs
hallerite Jun 11, 2026
a718d59
Merge branch 'feat/algorithm-abstraction' into feat/maxrl-advantage
hallerite Jun 11, 2026
262baf2
chore: section-style algo config in max_rl debug toml
hallerite Jun 11, 2026
509d6a3
feat(echo): per-role echo weights + user-supplied token filter
hallerite Jun 11, 2026
e55f067
Merge branch 'feat/echo-selection' into feat/algorithm-abstraction
hallerite Jun 12, 2026
8e7e51f
feat(algo): advantage types are the algorithm names; preset layer del…
hallerite Jun 12, 2026
48ac7ff
docs: scrub stale algorithm names and preset vocabulary
hallerite Jun 12, 2026
f5be6f4
fix(trainer): anchor the loss to the graph when every component is empty
hallerite Jun 12, 2026
0606f12
fix(trainer): namespace ref_kl loss metrics; harden batch preparation
hallerite Jun 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions configs/ci/integration/reverse_text_rl_opd/start.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Smoke test for the RL-entrypoint OPD (on-policy distillation) training mode.
# The student inference deployment is launched by the rl entrypoint on GPU 0;
# the teacher inference server is started externally by the test fixture (see
# Smoke test for the RL-entrypoint opd algorithm (on-policy distillation).
# The policy inference deployment is launched by the rl entrypoint on GPU 0;
# the frozen reference server is started externally by the test fixture (see
# tests/integration/test_reverse_text_rl_opd.py) on the same GPU. Trainer
# takes GPU 1. Training config mirrors `reverse_text/start.toml`.

Expand All @@ -15,10 +15,16 @@ project = "reverse-text-ci"
name = "ci-rl-opd"

[orchestrator]
training_mode = "opd"
batch_size = 128
group_size = 16

[orchestrator.algo]
name = "opd"

[orchestrator.algo.model]
name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
base_url = ["http://localhost:8001/v1"]

[orchestrator.renderer]
name = "qwen3"

Expand All @@ -38,12 +44,6 @@ max_completion_tokens = 128
[[orchestrator.eval.env]]
id = "reverse-text"

[orchestrator.teacher.model]
name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"

[orchestrator.teacher.client]
base_url = ["http://localhost:8001/v1"]

[trainer.optim]
lr = 3e-6

Expand Down
24 changes: 12 additions & 12 deletions configs/ci/integration/reverse_text_rl_sft/start.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Smoke test for the RL-entrypoint SFT (on-policy hard distillation) training
# mode. The student inference deployment is launched by the rl entrypoint on
# GPU 0; the teacher inference server is started externally by the test
# fixture (see tests/integration/test_reverse_text_rl_sft.py) on the same GPU.
# Trainer takes GPU 1. Training config mirrors `reverse_text/start.toml`.
# Smoke test for the RL-entrypoint sft_distill algorithm (hard distillation).
# The policy inference deployment is launched by the rl entrypoint on GPU 0;
# the frozen sampling server is started externally by the test fixture (see
# tests/integration/test_reverse_text_rl_sft.py) on the same GPU. Trainer
# takes GPU 1. Training config mirrors `reverse_text/start.toml`.

max_steps = 5
seq_len = 2048
Expand All @@ -15,10 +15,16 @@ project = "reverse-text-ci"
name = "ci-rl-sft"

[orchestrator]
training_mode = "sft"
batch_size = 128
group_size = 16

[orchestrator.algo]
name = "sft_distill"

[orchestrator.algo.model]
name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
base_url = ["http://localhost:8001/v1"]

[orchestrator.train.sampling]
max_completion_tokens = 128

Expand All @@ -35,12 +41,6 @@ max_completion_tokens = 128
[[orchestrator.eval.env]]
id = "reverse-text"

[orchestrator.teacher.model]
name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"

[orchestrator.teacher.client]
base_url = ["http://localhost:8001/v1"]

[trainer.optim]
lr = 3e-6

Expand Down
61 changes: 61 additions & 0 deletions configs/debug/algorithms/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Algorithm — Debug Configs

Minimal end-to-end configs for the algorithm presets against bundled verifiers envs, using `PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT` as the policy.

| Config | Algorithm | Frozen model | Notes |
|---|---|---|---|
| `grpo.toml` | `grpo` | none | |
| `opd.toml` | `opd` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | |
| `opd_lora.toml` | `opd` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | trains a LoRA adapter (rank 8) |
| `sft_distill.toml` | `sft_distill` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | |
| `sft_distill_lora.toml` | `sft_distill` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | trains a LoRA adapter (rank 8) |
| `sft_distill_external.toml` | `sft_distill` | PI inference (`openai/gpt-5-mini`) | external OAI endpoint; no local server |
| `self_distill.toml` | `self_distill` | none (`model = "policy"`) | SDFT against the live policy; demo from reverse-text's `answer` field |
| `echo.toml` | `echo` | none | multi-turn `alphabet-sort`; CE on observation tokens |
| `mixed_grpo_opd.toml` | `grpo` + `opd` (per env) | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | two envs, one run; heterogeneous batches (with/without `ref_logprobs`) |

The policy inference server is auto-launched on GPU 0 at `http://localhost:8000/v1` with `gpu_memory_utilization=0.5`. The local frozen model (used by `opd*.toml`, `sft_distill.toml` / `sft_distill_lora.toml`, and `mixed_grpo_opd.toml`) is **not** auto-launched — start it manually on GPU 1.

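Frozen models are declared inline on the algorithm that uses them — `[orchestrator.algo.model]` (or `[orchestrator.train.env.algo.model]` for a per-env algorithm, as in `mixed_grpo_opd.toml`) with `name` + `base_url` — and prime-rl never hosts them; only the trainable policy's server is managed by the `rl` entrypoint.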

## Start the local frozen model

Needed for `opd*.toml`, `sft_distill.toml` / `sft_distill_lora.toml`, and `mixed_grpo_opd.toml`:

```bash
CUDA_VISIBLE_DEVICES=1 uv run inference \
--model.name PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL \
--server.port 8001 \
--gpu-memory-utilization 0.5 \
--model.enforce-eager
```

## Run the debug configs

```bash
# GRPO (no frozen model)
uv run rl @ configs/debug/algorithms/grpo.toml

# OPD (needs the frozen model on port 8001)
uv run rl @ configs/debug/algorithms/opd.toml
uv run rl @ configs/debug/algorithms/opd_lora.toml

# SFT distillation (needs the frozen model on port 8001)
uv run rl @ configs/debug/algorithms/sft_distill.toml
uv run rl @ configs/debug/algorithms/sft_distill_lora.toml

# SFT distillation from openai/gpt-5-mini via PI inference
# (requires PRIME_API_KEY + PRIME_TEAM_ID in env; no local frozen model needed)
uv run rl @ configs/debug/algorithms/sft_distill_external.toml

# Self-distillation against the live policy (no frozen model)
uv run rl @ configs/debug/algorithms/self_distill.toml

# ECHO (no frozen model; multi-turn env)
uv run rl @ configs/debug/algorithms/echo.toml

# Mixed per-env algorithms: GRPO + OPD in one run (needs the frozen model on port 8001)
uv run rl @ configs/debug/algorithms/mixed_grpo_opd.toml
```

See [docs/algorithms.md](../../../docs/algorithms.md) for what each algorithm does and how to compose custom ones.
47 changes: 47 additions & 0 deletions configs/debug/algorithms/echo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# ECHO on the multi-turn alphabet-sort env (bundled with verifiers): GRPO on
# action tokens + weighted CE on the env's observation tokens.
# Run with:
#   uv run rl @ configs/debug/algorithms/echo.toml

max_steps = 20
seq_len = 4096

[model]
name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT"

[wandb]
project = "algorithms-debug"
name = "debug-echo"

[orchestrator]
batch_size = 32
group_size = 4

# Assembled by hand rather than selected via a preset `name`: presets are
# atomic (taken whole or not at all), and this config overrides echo's lambda,
# i.e. the `observation_weight` applied to the observation-token CE term.
[orchestrator.algo.advantage]
type = "echo"
observation_weight = 0.1

[[orchestrator.train.env]]
id = "alphabet-sort"
args = { min_turns = 3, max_turns = 5, power_per_turn = false }

[orchestrator.train.sampling]
max_completion_tokens = 512

# ECHO learns from observation tokens even when the GRPO advantage collapses
# to zero, so zero-advantage rollouts must stay in the batch: the filter is
# declared but not enforced.
[[orchestrator.post_batch_filters]]
type = "zero_advantage"
enforce = false

# The policy is a Qwen3 finetune with the standard PI template patch, which
# always re-emits prior <think> blocks; preserve_all_thinking makes the qwen3
# renderer match that behavior.
[orchestrator.renderer]
name = "qwen3"
preserve_all_thinking = true

[trainer.optim]
lr = 1e-6

[inference]
gpu_memory_utilization = 0.5
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@ project = "reverse-text-debug"
name = "debug-rl"

[orchestrator]
training_mode = "rl"
batch_size = 128
group_size = 16

[orchestrator.algo]
name = "grpo"

[orchestrator.renderer]
name = "qwen3"

Expand Down
63 changes: 63 additions & 0 deletions configs/debug/algorithms/mixed_grpo_opd.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Mixed per-env algorithms in one run: a GRPO env and an OPD env, both on
# reverse-text. Exercises heterogeneous train batches — OPD samples ship
# ref_logprobs, GRPO samples don't, and both pack into the same micro batches.
# Start the frozen reference server first (on a separate GPU):
# CUDA_VISIBLE_DEVICES=1 uv run inference \
# --model.name PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL \
# --server.port 8001 --gpu-memory-utilization 0.5 --model.enforce-eager
# Then:
# uv run rl @ configs/debug/algorithms/mixed_grpo_opd.toml

max_steps = 20
seq_len = 2048

[model]
name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT"

[wandb]
project = "algorithms-debug"
name = "debug-mixed-grpo-opd"

[orchestrator]
batch_size = 128
group_size = 16

# Orchestrator-level algorithm. Presumably the default for any train env that
# declares no `algo` of its own (here: reverse-text-grpo) — TODO confirm
# against the config loader.
[orchestrator.algo]
name = "grpo"

[orchestrator.renderer]
name = "qwen3"

[orchestrator.train.sampling]
max_completion_tokens = 128

# Env 1: no per-env `algo` section.
[[orchestrator.train.env]]
id = "reverse-text"
name = "reverse-text-grpo"

# Env 2: same underlying env id under a distinct name, with its own algorithm.
[[orchestrator.train.env]]
id = "reverse-text"
name = "reverse-text-opd"

# Per-env override. In TOML these sub-tables attach to the most recent
# `[[orchestrator.train.env]]` element (reverse-text-opd), so they must stay
# directly below it and before any further `[[orchestrator.train.env]]` header.
[orchestrator.train.env.algo]
name = "opd"

# Frozen reference model for OPD, reached at the externally started server on
# port 8001 (see the launch command at the top of this file).
[orchestrator.train.env.algo.model]
name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
base_url = ["http://localhost:8001/v1"]

[orchestrator.eval]
interval = 5
num_examples = 128

[orchestrator.eval.sampling]
max_completion_tokens = 128

[[orchestrator.eval.env]]
id = "reverse-text"

[trainer.optim]
lr = 3e-6

[inference]
gpu_memory_utilization = 0.5
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Start the teacher inference server first (on a separate GPU):
# Start the frozen reference server first (on a separate GPU):
# CUDA_VISIBLE_DEVICES=1 uv run inference \
# --model.name PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL \
# --server.port 8001 --gpu-memory-utilization 0.5 --model.enforce-eager
# Then:
# uv run rl @ configs/debug/training_modes/opd.toml
# uv run rl @ configs/debug/algorithms/opd.toml

max_steps = 20
seq_len = 2048
Expand All @@ -16,10 +16,16 @@ project = "reverse-text-debug"
name = "debug-opd"

[orchestrator]
training_mode = "opd"
batch_size = 128
group_size = 16

[orchestrator.algo]
name = "opd"

[orchestrator.algo.model]
name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
base_url = ["http://localhost:8001/v1"]

[orchestrator.renderer]
name = "qwen3"

Expand All @@ -39,12 +45,6 @@ max_completion_tokens = 128
[[orchestrator.eval.env]]
id = "reverse-text"

[orchestrator.teacher.model]
name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"

[orchestrator.teacher.client]
base_url = ["http://localhost:8001/v1"]

[trainer.optim]
lr = 3e-6

Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Start the teacher inference server first (on a separate GPU):
# Start the frozen reference server first (on a separate GPU):
# CUDA_VISIBLE_DEVICES=1 uv run inference \
# --model.name PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL \
# --server.port 8001 --gpu-memory-utilization 0.5 --model.enforce-eager
# Then:
# uv run rl @ configs/debug/training_modes/opd_lora.toml
# uv run rl @ configs/debug/algorithms/opd_lora.toml

max_steps = 20
seq_len = 2048
Expand All @@ -16,10 +16,16 @@ project = "reverse-text-debug"
name = "debug-opd-lora"

[orchestrator]
training_mode = "opd"
batch_size = 128
group_size = 16

[orchestrator.algo]
name = "opd"

[orchestrator.algo.model]
name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
base_url = ["http://localhost:8001/v1"]

[orchestrator.renderer]
name = "qwen3"

Expand All @@ -39,12 +45,6 @@ max_completion_tokens = 128
[[orchestrator.eval.env]]
id = "reverse-text"

[orchestrator.teacher.model]
name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"

[orchestrator.teacher.client]
base_url = ["http://localhost:8001/v1"]

[trainer.optim]
lr = 1e-4

Expand Down
Loading
Loading