PrimeIntellect-ai · hallerite · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026 · Jun 10, 2026
diff --git a/configs/ci/integration/reverse_text_rl_opd/start.toml b/configs/ci/integration/reverse_text_rl_opd/start.toml
@@ -1,6 +1,6 @@
-# Smoke test for the RL-entrypoint OPD (on-policy distillation) training mode.
-# The student inference deployment is launched by the rl entrypoint on GPU 0;
-# the teacher inference server is started externally by the test fixture (see
+# Smoke test for the RL-entrypoint opd algorithm (on-policy distillation).
+# The policy inference deployment is launched by the rl entrypoint on GPU 0;
+# the frozen reference server is started externally by the test fixture (see
 # tests/integration/test_reverse_text_rl_opd.py) on the same GPU. Trainer
 # takes GPU 1. Training config mirrors `reverse_text/start.toml`.
 
@@ -15,10 +15,16 @@ project = "reverse-text-ci"
 name = "ci-rl-opd"
 
 [orchestrator]
-training_mode = "opd"
 batch_size = 128
 group_size = 16
 
+[orchestrator.algo]
+name = "opd"
+
+[orchestrator.algo.model]
+name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
+base_url = ["http://localhost:8001/v1"]
+
 [orchestrator.renderer]
 name = "qwen3"
 
@@ -38,12 +44,6 @@ max_completion_tokens = 128
 [[orchestrator.eval.env]]
 id = "reverse-text"
 
-[orchestrator.teacher.model]
-name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
-
-[orchestrator.teacher.client]
-base_url = ["http://localhost:8001/v1"]
-
 [trainer.optim]
 lr = 3e-6
 

diff --git a/configs/ci/integration/reverse_text_rl_sft/start.toml b/configs/ci/integration/reverse_text_rl_sft/start.toml
@@ -1,8 +1,8 @@
-# Smoke test for the RL-entrypoint SFT (on-policy hard distillation) training
-# mode. The student inference deployment is launched by the rl entrypoint on
-# GPU 0; the teacher inference server is started externally by the test
-# fixture (see tests/integration/test_reverse_text_rl_sft.py) on the same GPU.
-# Trainer takes GPU 1. Training config mirrors `reverse_text/start.toml`.
+# Smoke test for the RL-entrypoint sft_distill algorithm (hard distillation).
+# The policy inference deployment is launched by the rl entrypoint on GPU 0;
+# the frozen sampling server is started externally by the test fixture (see
+# tests/integration/test_reverse_text_rl_sft.py) on the same GPU. Trainer
+# takes GPU 1. Training config mirrors `reverse_text/start.toml`.
 
 max_steps = 5
 seq_len = 2048
@@ -15,10 +15,16 @@ project = "reverse-text-ci"
 name = "ci-rl-sft"
 
 [orchestrator]
-training_mode = "sft"
 batch_size = 128
 group_size = 16
 
+[orchestrator.algo]
+name = "sft_distill"
+
+[orchestrator.algo.model]
+name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
+base_url = ["http://localhost:8001/v1"]
+
 [orchestrator.train.sampling]
 max_completion_tokens = 128
 
@@ -35,12 +41,6 @@ max_completion_tokens = 128
 [[orchestrator.eval.env]]
 id = "reverse-text"
 
-[orchestrator.teacher.model]
-name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
-
-[orchestrator.teacher.client]
-base_url = ["http://localhost:8001/v1"]
-
 [trainer.optim]
 lr = 3e-6
 

diff --git a/configs/debug/algorithms/README.md b/configs/debug/algorithms/README.md
@@ -0,0 +1,61 @@
+# Algorithm — Debug Configs
+
+Minimal end-to-end configs for the algorithm presets against bundled verifiers envs, using `PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT` as the policy.
+
+| Config | Algorithm | Frozen model | Notes |
+|---|---|---|---|
+| `grpo.toml` | `grpo` | none | |
+| `opd.toml` | `opd` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | |
+| `opd_lora.toml` | `opd` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | trains a LoRA adapter (rank 8) |
+| `sft_distill.toml` | `sft_distill` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | |
+| `sft_distill_lora.toml` | `sft_distill` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | trains a LoRA adapter (rank 8) |
+| `sft_distill_external.toml` | `sft_distill` | PI inference (`openai/gpt-5-mini`) | external OAI endpoint; no local server |
+| `self_distill.toml` | `self_distill` | none (`model = "policy"`) | SDFT against the live policy; demo from reverse-text's `answer` field |
+| `echo.toml` | `echo` | none | multi-turn `alphabet-sort`; CE on observation tokens |
+| `mixed_grpo_opd.toml` | `grpo` + `opd` (per env) | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | two envs, one run; heterogeneous batches (with/without `ref_logprobs`) |
+
+The policy inference server is auto-launched on GPU 0 at `http://localhost:8000/v1` with `gpu_memory_utilization=0.5`. The local frozen model (used by `opd*.toml`, `sft_distill.toml` / `sft_distill_lora.toml`, and `mixed_grpo_opd.toml`) is **not** auto-launched — start it manually on GPU 1.
+
+Frozen models are declared inline on the algorithm — `[orchestrator.algo.model]` (or `[orchestrator.train.env.algo.model]` for a per-env algorithm) with `name` + `base_url` — and prime-rl never hosts them; only the trainable policy's server is managed by the `rl` entrypoint.
+
+## Start the local frozen model
+
+Needed for `opd*.toml`, `sft_distill.toml` / `sft_distill_lora.toml`, and `mixed_grpo_opd.toml`:
+
+```bash
+CUDA_VISIBLE_DEVICES=1 uv run inference \
+  --model.name PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL \
+  --server.port 8001 \
+  --gpu-memory-utilization 0.5 \
+  --model.enforce-eager
+```
+
+## Run the debug configs
+
+```bash
+# GRPO (no frozen model)
+uv run rl @ configs/debug/algorithms/grpo.toml
+
+# OPD (needs the frozen model on port 8001)
+uv run rl @ configs/debug/algorithms/opd.toml
+uv run rl @ configs/debug/algorithms/opd_lora.toml
+
+# SFT distillation (needs the frozen model on port 8001)
+uv run rl @ configs/debug/algorithms/sft_distill.toml
+uv run rl @ configs/debug/algorithms/sft_distill_lora.toml
+
+# SFT distillation from openai/gpt-5-mini via PI inference
+# (requires PRIME_API_KEY + PRIME_TEAM_ID in env; no local frozen model needed)
+uv run rl @ configs/debug/algorithms/sft_distill_external.toml
+
+# Self-distillation against the live policy (no frozen model)
+uv run rl @ configs/debug/algorithms/self_distill.toml
+
+# ECHO (no frozen model; multi-turn env)
+uv run rl @ configs/debug/algorithms/echo.toml
+
+# Mixed per-env algorithms: GRPO + OPD in one run (needs the frozen model on port 8001)
+uv run rl @ configs/debug/algorithms/mixed_grpo_opd.toml
+```
+
+See [docs/algorithms.md](../../../docs/algorithms.md) for what each algorithm does and how to compose custom ones.
diff --git a/configs/debug/algorithms/echo.toml b/configs/debug/algorithms/echo.toml
@@ -0,0 +1,47 @@
+# ECHO on the multi-turn alphabet-sort env (bundled with verifiers): GRPO on
+# action tokens + weighted CE on the env's observation tokens.
+#   uv run rl @ configs/debug/algorithms/echo.toml
+
+max_steps = 20
+seq_len = 4096
+
+[model]
+name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT"
+
+[wandb]
+project = "algorithms-debug"
+name = "debug-echo"
+
+[orchestrator]
+batch_size = 32
+group_size = 4
+
+# Assembled (no preset name): presets are atomic, and we set echo's lambda (`observation_weight`).
+[orchestrator.algo.advantage]
+type = "echo"
+observation_weight = 0.1
+
+[[orchestrator.train.env]]
+id = "alphabet-sort"
+args = { min_turns = 3, max_turns = 5, power_per_turn = false }
+
+[orchestrator.train.sampling]
+max_completion_tokens = 512
+
+# ECHO learns from observation tokens even when the GRPO advantage collapses
+# to zero — keep zero-advantage rollouts in the batch.
+[[orchestrator.post_batch_filters]]
+type = "zero_advantage"
+enforce = false
+
+# The policy is a Qwen3 finetune with the standard PI template patch, which always
+# re-emits prior <think> blocks; preserve_all_thinking makes the qwen3 renderer match.
+[orchestrator.renderer]
+name = "qwen3"
+preserve_all_thinking = true
+
+[trainer.optim]
+lr = 1e-6
+
+[inference]
+gpu_memory_utilization = 0.5
diff --git a/configs/debug/training_modes/rl.toml → configs/debug/algorithms/grpo.toml b/configs/debug/training_modes/rl.toml → configs/debug/algorithms/grpo.toml
@@ -9,10 +9,12 @@ project = "reverse-text-debug"
 name = "debug-rl"
 
 [orchestrator]
-training_mode = "rl"
 batch_size = 128
 group_size = 16
 
+[orchestrator.algo]
+name = "grpo"
+
 [orchestrator.renderer]
 name = "qwen3"
 

diff --git a/configs/debug/algorithms/mixed_grpo_opd.toml b/configs/debug/algorithms/mixed_grpo_opd.toml
@@ -0,0 +1,63 @@
+# Mixed per-env algorithms in one run: a GRPO env and an OPD env, both on
+# reverse-text. Exercises heterogeneous train batches — OPD samples ship
+# ref_logprobs, GRPO samples don't, and both pack into the same micro batches.
+# Start the frozen reference server first (on a separate GPU):
+#   CUDA_VISIBLE_DEVICES=1 uv run inference \
+#     --model.name PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL \
+#     --server.port 8001 --gpu-memory-utilization 0.5 --model.enforce-eager
+# Then:
+#   uv run rl @ configs/debug/algorithms/mixed_grpo_opd.toml
+
+max_steps = 20
+seq_len = 2048
+
+[model]
+name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT"
+
+[wandb]
+project = "algorithms-debug"
+name = "debug-mixed-grpo-opd"
+
+[orchestrator]
+batch_size = 128
+group_size = 16
+
+[orchestrator.algo]
+name = "grpo"
+
+[orchestrator.renderer]
+name = "qwen3"
+
+[orchestrator.train.sampling]
+max_completion_tokens = 128
+
+[[orchestrator.train.env]]
+id = "reverse-text"
+name = "reverse-text-grpo"
+
+[[orchestrator.train.env]]
+id = "reverse-text"
+name = "reverse-text-opd"
+
+[orchestrator.train.env.algo]
+name = "opd"
+
+[orchestrator.train.env.algo.model]
+name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
+base_url = ["http://localhost:8001/v1"]
+
+[orchestrator.eval]
+interval = 5
+num_examples = 128
+
+[orchestrator.eval.sampling]
+max_completion_tokens = 128
+
+[[orchestrator.eval.env]]
+id = "reverse-text"
+
+[trainer.optim]
+lr = 3e-6
+
+[inference]
+gpu_memory_utilization = 0.5
diff --git a/configs/debug/training_modes/opd.toml → configs/debug/algorithms/opd.toml b/configs/debug/training_modes/opd.toml → configs/debug/algorithms/opd.toml
@@ -1,9 +1,9 @@
-# Start the teacher inference server first (on a separate GPU):
+# Start the frozen reference server first (on a separate GPU):
 #   CUDA_VISIBLE_DEVICES=1 uv run inference \
 #     --model.name PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL \
 #     --server.port 8001 --gpu-memory-utilization 0.5 --model.enforce-eager
 # Then:
-#   uv run rl @ configs/debug/training_modes/opd.toml
+#   uv run rl @ configs/debug/algorithms/opd.toml
 
 max_steps = 20
 seq_len = 2048
@@ -16,10 +16,16 @@ project = "reverse-text-debug"
 name = "debug-opd"
 
 [orchestrator]
-training_mode = "opd"
 batch_size = 128
 group_size = 16
 
+[orchestrator.algo]
+name = "opd"
+
+[orchestrator.algo.model]
+name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
+base_url = ["http://localhost:8001/v1"]
+
 [orchestrator.renderer]
 name = "qwen3"
 
@@ -39,12 +45,6 @@ max_completion_tokens = 128
 [[orchestrator.eval.env]]
 id = "reverse-text"
 
-[orchestrator.teacher.model]
-name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
-
-[orchestrator.teacher.client]
-base_url = ["http://localhost:8001/v1"]
-
 [trainer.optim]
 lr = 3e-6
 

diff --git a/configs/debug/training_modes/opd_lora.toml → configs/debug/algorithms/opd_lora.toml b/configs/debug/training_modes/opd_lora.toml → configs/debug/algorithms/opd_lora.toml
@@ -1,9 +1,9 @@
-# Start the teacher inference server first (on a separate GPU):
+# Start the frozen reference server first (on a separate GPU):
 #   CUDA_VISIBLE_DEVICES=1 uv run inference \
 #     --model.name PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL \
 #     --server.port 8001 --gpu-memory-utilization 0.5 --model.enforce-eager
 # Then:
-#   uv run rl @ configs/debug/training_modes/opd_lora.toml
+#   uv run rl @ configs/debug/algorithms/opd_lora.toml
 
 max_steps = 20
 seq_len = 2048
@@ -16,10 +16,16 @@ project = "reverse-text-debug"
 name = "debug-opd-lora"
 
 [orchestrator]
-training_mode = "opd"
 batch_size = 128
 group_size = 16
 
+[orchestrator.algo]
+name = "opd"
+
+[orchestrator.algo.model]
+name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
+base_url = ["http://localhost:8001/v1"]
+
 [orchestrator.renderer]
 name = "qwen3"
 
@@ -39,12 +45,6 @@ max_completion_tokens = 128
 [[orchestrator.eval.env]]
 id = "reverse-text"
 
-[orchestrator.teacher.model]
-name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
-
-[orchestrator.teacher.client]
-base_url = ["http://localhost:8001/v1"]
-
 [trainer.optim]
 lr = 1e-4