PrimeIntellect-ai · hallerite · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026 · Jun 10, 2026
diff --git a/configs/ci/integration/reverse_text_rl_opd/start.toml b/configs/ci/integration/reverse_text_rl_opd/start.toml
@@ -1,6 +1,6 @@
-# Smoke test for the RL-entrypoint OPD (on-policy distillation) training mode.
-# The student inference deployment is launched by the rl entrypoint on GPU 0;
-# the teacher inference server is started externally by the test fixture (see
+# Smoke test for the RL-entrypoint opd algorithm (on-policy distillation).
+# The policy inference deployment is launched by the rl entrypoint on GPU 0;
+# the frozen reference server is started externally by the test fixture (see
 # tests/integration/test_reverse_text_rl_opd.py) on the same GPU. Trainer
 # takes GPU 1. Training config mirrors `reverse_text/start.toml`.
 
@@ -15,7 +15,7 @@ project = "reverse-text-ci"
 name = "ci-rl-opd"
 
 [orchestrator]
-training_mode = "opd"
+algo = { name = "opd", model = "reverse-text-rl" }
 batch_size = 128
 group_size = 16
 
@@ -38,10 +38,10 @@ max_completion_tokens = 128
 [[orchestrator.eval.env]]
 id = "reverse-text"
 
-[orchestrator.teacher.model]
+[orchestrator.models.reverse-text-rl.model]
 name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
 
-[orchestrator.teacher.client]
+[orchestrator.models.reverse-text-rl.client]
 base_url = ["http://localhost:8001/v1"]
 
 [trainer.optim]

diff --git a/configs/ci/integration/reverse_text_rl_sft/start.toml b/configs/ci/integration/reverse_text_rl_sft/start.toml
@@ -1,8 +1,8 @@
-# Smoke test for the RL-entrypoint SFT (on-policy hard distillation) training
-# mode. The student inference deployment is launched by the rl entrypoint on
-# GPU 0; the teacher inference server is started externally by the test
-# fixture (see tests/integration/test_reverse_text_rl_sft.py) on the same GPU.
-# Trainer takes GPU 1. Training config mirrors `reverse_text/start.toml`.
+# Smoke test for the RL-entrypoint sft_distill algorithm (hard distillation).
+# The policy inference deployment is launched by the rl entrypoint on GPU 0;
+# the frozen sampling server is started externally by the test fixture (see
+# tests/integration/test_reverse_text_rl_sft.py) on the same GPU. Trainer
+# takes GPU 1. Training config mirrors `reverse_text/start.toml`.
 
 max_steps = 5
 seq_len = 2048
@@ -15,7 +15,7 @@ project = "reverse-text-ci"
 name = "ci-rl-sft"
 
 [orchestrator]
-training_mode = "sft"
+algo = { name = "sft_distill", model = "reverse-text-rl" }
 batch_size = 128
 group_size = 16
 
@@ -35,10 +35,10 @@ max_completion_tokens = 128
 [[orchestrator.eval.env]]
 id = "reverse-text"
 
-[orchestrator.teacher.model]
+[orchestrator.models.reverse-text-rl.model]
 name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
 
-[orchestrator.teacher.client]
+[orchestrator.models.reverse-text-rl.client]
 base_url = ["http://localhost:8001/v1"]
 
 [trainer.optim]

diff --git a/configs/debug/algorithms/README.md b/configs/debug/algorithms/README.md
@@ -0,0 +1,57 @@
+# Algorithm — Debug Configs
+
+Minimal end-to-end configs for the algorithm presets, run against environments bundled with `verifiers`, using `PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT` as the policy.
+
+| Config | Algorithm | Frozen model | Notes |
+|---|---|---|---|
+| `grpo.toml` | `grpo` | none | |
+| `opd.toml` | `opd` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | |
+| `opd_lora.toml` | `opd` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | trains a LoRA adapter (rank 8) |
+| `sft_distill.toml` | `sft_distill` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | |
+| `sft_distill_lora.toml` | `sft_distill` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | trains a LoRA adapter (rank 8) |
+| `sft_distill_external.toml` | `sft_distill` | PI inference (`openai/gpt-5-mini`) | external OAI endpoint; no local server |
+| `self_distill.toml` | `self_distill` | none (`model = "policy"`) | SDFT against the live policy; demo from reverse-text's `answer` field |
+| `echo.toml` | `echo` | none | multi-turn `alphabet-sort`; CE on observation tokens |
+
+The policy inference server is auto-launched on GPU 0 at `http://localhost:8000/v1` with `gpu_memory_utilization=0.5`. The local frozen model (used by `opd*.toml` and `sft_distill.toml` / `sft_distill_lora.toml`) is **not** auto-launched — start it manually on GPU 1 so that it serves `http://localhost:8001/v1`, the endpoint those configs point at.
+
+Frozen models are plain `[orchestrator.models.<key>]` entries; the algorithm points at them by key (`algo = { name = "opd", model = "reverse-text-rl" }`). There is no dedicated teacher slot — the same entry can serve any number of envs' algorithms.
+
+## Start the local frozen model
+
+Needed for `opd*.toml` and `sft_distill.toml` / `sft_distill_lora.toml`:
+
+```bash
+CUDA_VISIBLE_DEVICES=1 uv run inference \
+  --model.name PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL \
+  --server.port 8001 \
+  --gpu-memory-utilization 0.5 \
+  --model.enforce-eager
+```
+
+## Run the debug configs
+
+```bash
+# GRPO (no frozen model)
+uv run rl @ configs/debug/algorithms/grpo.toml
+
+# OPD (needs the frozen model on port 8001)
+uv run rl @ configs/debug/algorithms/opd.toml
+uv run rl @ configs/debug/algorithms/opd_lora.toml
+
+# SFT distillation (needs the frozen model on port 8001)
+uv run rl @ configs/debug/algorithms/sft_distill.toml
+uv run rl @ configs/debug/algorithms/sft_distill_lora.toml
+
+# SFT distillation from openai/gpt-5-mini via PI inference
+# (requires PRIME_API_KEY + PRIME_TEAM_ID in env; no local frozen model needed)
+uv run rl @ configs/debug/algorithms/sft_distill_external.toml
+
+# Self-distillation against the live policy (no frozen model)
+uv run rl @ configs/debug/algorithms/self_distill.toml
+
+# ECHO (no frozen model; multi-turn env)
+uv run rl @ configs/debug/algorithms/echo.toml
+```
+
+See [docs/algorithms.md](../../../docs/algorithms.md) for what each algorithm does and how to compose custom ones.
diff --git a/configs/debug/algorithms/echo.toml b/configs/debug/algorithms/echo.toml
@@ -0,0 +1,48 @@
+# ECHO on the multi-turn alphabet-sort env (bundled with verifiers): GRPO on
+# action tokens + weighted CE on the env's observation tokens.
+#   uv run rl @ configs/debug/algorithms/echo.toml
+
+max_steps = 20
+seq_len = 4096
+
+[model]
+name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT"
+
+[wandb]
+project = "algorithms-debug"
+name = "debug-echo"
+
+[orchestrator]
+batch_size = 32
+group_size = 4
+
+[orchestrator.algo]
+name = "echo"
+
+[orchestrator.algo.loss]
+observation_weight = 0.1
+
+[[orchestrator.train.env]]
+id = "alphabet-sort"
+args = { min_turns = 3, max_turns = 5, power_per_turn = false }
+
+[orchestrator.train.sampling]
+max_completion_tokens = 512
+
+# ECHO learns from observation tokens even when the GRPO advantage collapses
+# to zero — keep zero-advantage rollouts in the batch.
+[[orchestrator.post_batch_filters]]
+type = "zero_advantage"
+enforce = false
+
+# The model is a Qwen3 finetune with the standard PI template patch, which always
+# re-emits prior <think> blocks; preserve_all_thinking makes the qwen3 renderer match.
+[orchestrator.renderer]
+name = "qwen3"
+preserve_all_thinking = true
+
+[trainer.optim]
+lr = 1e-6
+
+[inference]
+gpu_memory_utilization = 0.5
diff --git a/configs/debug/training_modes/rl.toml → configs/debug/algorithms/grpo.toml b/configs/debug/training_modes/rl.toml → configs/debug/algorithms/grpo.toml
@@ -9,7 +9,7 @@ project = "reverse-text-debug"
 name = "debug-rl"
 
 [orchestrator]
-training_mode = "rl"
+algo = { name = "grpo" }
 batch_size = 128
 group_size = 16
 

diff --git a/configs/debug/training_modes/opd.toml → configs/debug/algorithms/opd.toml b/configs/debug/training_modes/opd.toml → configs/debug/algorithms/opd.toml
@@ -1,9 +1,9 @@
-# Start the teacher inference server first (on a separate GPU):
+# Start the frozen reference server first (on a separate GPU):
 #   CUDA_VISIBLE_DEVICES=1 uv run inference \
 #     --model.name PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL \
 #     --server.port 8001 --gpu-memory-utilization 0.5 --model.enforce-eager
 # Then:
-#   uv run rl @ configs/debug/training_modes/opd.toml
+#   uv run rl @ configs/debug/algorithms/opd.toml
 
 max_steps = 20
 seq_len = 2048
@@ -16,7 +16,7 @@ project = "reverse-text-debug"
 name = "debug-opd"
 
 [orchestrator]
-training_mode = "opd"
+algo = { name = "opd", model = "reverse-text-rl" }
 batch_size = 128
 group_size = 16
 
@@ -39,10 +39,10 @@ max_completion_tokens = 128
 [[orchestrator.eval.env]]
 id = "reverse-text"
 
-[orchestrator.teacher.model]
+[orchestrator.models.reverse-text-rl.model]
 name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
 
-[orchestrator.teacher.client]
+[orchestrator.models.reverse-text-rl.client]
 base_url = ["http://localhost:8001/v1"]
 
 [trainer.optim]

diff --git a/configs/debug/training_modes/opd_lora.toml → configs/debug/algorithms/opd_lora.toml b/configs/debug/training_modes/opd_lora.toml → configs/debug/algorithms/opd_lora.toml
@@ -1,9 +1,9 @@
-# Start the teacher inference server first (on a separate GPU):
+# Start the frozen reference server first (on a separate GPU):
 #   CUDA_VISIBLE_DEVICES=1 uv run inference \
 #     --model.name PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL \
 #     --server.port 8001 --gpu-memory-utilization 0.5 --model.enforce-eager
 # Then:
-#   uv run rl @ configs/debug/training_modes/opd_lora.toml
+#   uv run rl @ configs/debug/algorithms/opd_lora.toml
 
 max_steps = 20
 seq_len = 2048
@@ -16,7 +16,7 @@ project = "reverse-text-debug"
 name = "debug-opd-lora"
 
 [orchestrator]
-training_mode = "opd"
+algo = { name = "opd", model = "reverse-text-rl" }
 batch_size = 128
 group_size = 16
 
@@ -39,10 +39,10 @@ max_completion_tokens = 128
 [[orchestrator.eval.env]]
 id = "reverse-text"
 
-[orchestrator.teacher.model]
+[orchestrator.models.reverse-text-rl.model]
 name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
 
-[orchestrator.teacher.client]
+[orchestrator.models.reverse-text-rl.client]
 base_url = ["http://localhost:8001/v1"]
 
 [trainer.optim]

diff --git a/configs/debug/algorithms/self_distill.toml b/configs/debug/algorithms/self_distill.toml
@@ -0,0 +1,48 @@
+# Self-distillation (SDFT, https://arxiv.org/abs/2601.19897) against the live
+# policy itself: the reference for each completion is the current model
+# conditioned on the expert demonstration — no extra deployment needed.
+# reverse-text carries the demonstration in its top-level `answer` field.
+#   uv run rl @ configs/debug/algorithms/self_distill.toml
+
+max_steps = 20
+seq_len = 2048
+
+[model]
+name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT"
+
+[wandb]
+project = "algorithms-debug"
+name = "debug-self-distill"
+
+[orchestrator]
+batch_size = 32
+group_size = 1
+
+[orchestrator.algo]
+name = "self_distill"
+advantage = { type = "demo_ref_kl", model = "policy", demo_key = "answer" }
+
+[orchestrator.renderer]
+name = "qwen3"
+
+[orchestrator.train.sampling]
+max_completion_tokens = 128
+
+[[orchestrator.train.env]]
+id = "reverse-text"
+
+[orchestrator.eval]
+interval = 1
+num_examples = 128
+
+[orchestrator.eval.sampling]
+max_completion_tokens = 128
+
+[[orchestrator.eval.env]]
+id = "reverse-text"
+
+[trainer.optim]
+lr = 3e-6
+
+[inference]
+gpu_memory_utilization = 0.5
diff --git a/configs/debug/training_modes/sft.toml → configs/debug/algorithms/sft_distill.toml b/configs/debug/training_modes/sft.toml → configs/debug/algorithms/sft_distill.toml
@@ -1,9 +1,9 @@
-# Start the teacher inference server first (on a separate GPU):
+# Start the frozen sampling server first (on a separate GPU):
 #   CUDA_VISIBLE_DEVICES=1 uv run inference \
 #     --model.name PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL \
 #     --server.port 8001 --gpu-memory-utilization 0.5 --model.enforce-eager
 # Then:
-#   uv run rl @ configs/debug/training_modes/sft.toml
+#   uv run rl @ configs/debug/algorithms/sft_distill.toml
 
 max_steps = 20
 seq_len = 2048
@@ -16,7 +16,7 @@ project = "reverse-text-debug"
 name = "debug-sft"
 
 [orchestrator]
-training_mode = "sft"
+algo = { name = "sft_distill", model = "reverse-text-rl" }
 batch_size = 128
 group_size = 4
 
@@ -36,10 +36,10 @@ max_completion_tokens = 128
 [[orchestrator.eval.env]]
 id = "reverse-text"
 
-[orchestrator.teacher.model]
+[orchestrator.models.reverse-text-rl.model]
 name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-RL"
 
-[orchestrator.teacher.client]
+[orchestrator.models.reverse-text-rl.client]
 base_url = ["http://localhost:8001/v1"]
 
 [trainer.optim]

diff --git a/...gs/debug/training_modes/sft_external.toml → ...ebug/algorithms/sft_distill_external.toml b/...gs/debug/training_modes/sft_external.toml → ...ebug/algorithms/sft_distill_external.toml
@@ -1,8 +1,8 @@
-# SFT from openai/gpt-5-mini via PI inference.
+# SFT distillation from openai/gpt-5-mini via PI inference.
 # Requires PRIME_API_KEY + PRIME_TEAM_ID in the environment.
 #
 # Run with:
-#   uv run rl @ configs/debug/training_modes/sft_external.toml
+#   uv run rl @ configs/debug/algorithms/sft_distill_external.toml
 
 max_steps = 20
 seq_len = 2048
@@ -15,7 +15,7 @@ project = "reverse-text-debug"
 name = "debug-sft-external"
 
 [orchestrator]
-training_mode = "sft"
+algo = { name = "sft_distill", model = "gpt-5-mini" }
 batch_size = 128
 group_size = 4
 
@@ -36,14 +36,14 @@ max_completion_tokens = 128
 [[orchestrator.eval.env]]
 id = "reverse-text"
 
-[orchestrator.teacher.model]
+[orchestrator.models.gpt-5-mini.model]
 name = "openai/gpt-5-mini"
 
-[orchestrator.teacher.client]
+[orchestrator.models.gpt-5-mini.client]
 base_url = ["https://api.pinference.ai/api/v1"]
 api_key_var = "PRIME_API_KEY"
 
-[orchestrator.teacher.client.headers_from_env]
+[orchestrator.models.gpt-5-mini.client.headers_from_env]
 X-Prime-Team-ID = "PRIME_TEAM_ID"
 
 [trainer.optim]