PrimeIntellect-ai · mikasenghaas · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/.github/workflows/cpu_tests.yaml b/.github/workflows/cpu_tests.yaml
@@ -77,8 +77,11 @@ jobs:
           from prime_rl.utils.config import BaseConfig, find_package_resource, rgetattr, rsetattr
           from prime_rl.utils.validation import validate_shared_ckpt_config
 
+          # `verifiers` (+ its `datasets` dep) is a declared slim dep: the v1 config types
+          # (EnvConfig, Task, ...) extend verifiers.v1, which is pure-pydantic and pulls no
+          # GPU/ML deps. We still forbid the actual heavy training deps below.
           forbidden = ["torch", "transformers", "vllm", "wandb", "ring_flash_attn",
-                       "verifiers", "prime", "datasets", "liger_kernel", "loguru"]
+                       "prime", "liger_kernel", "loguru"]
           leaked = [m for m in forbidden if m in sys.modules]
           if leaked:
               raise SystemExit(f"slim install leaked heavy deps into sys.modules: {leaked}")

diff --git a/.gitmodules b/.gitmodules
@@ -1,6 +1,3 @@
-[submodule "verifiers"]
-	path = deps/verifiers
-	url = git@github.com:PrimeIntellect-ai/verifiers.git
 [submodule "renderers"]
 	path = deps/renderers
 	url = git@github.com:PrimeIntellect-ai/renderers.git
@@ -13,3 +10,7 @@
 [submodule "pydantic-config"]
 	path = deps/pydantic-config
 	url = https://github.com/PrimeIntellect-ai/pydantic-config
+[submodule "deps/verifiers"]
+	path = deps/verifiers
+	url = git@github.com:PrimeIntellect-ai/verifiers.git
+	branch = feat/nano-as-v1
diff --git a/configs/ci/integration/reverse_text_rl_sft/start.toml b/configs/ci/integration/reverse_text_rl_sft/start.toml
@@ -19,6 +19,10 @@ training_mode = "sft"
 batch_size = 128
 group_size = 16
 
+# The teacher rolls out over plain chat-completions (no tokens); the renderer backfills the missing tokens.
+[orchestrator.renderer]
+name = "qwen3"
+
 [orchestrator.train.sampling]
 max_completion_tokens = 128
 

diff --git a/configs/debug/reverse_text_v1.toml b/configs/debug/reverse_text_v1.toml
diff --git a/configs/debug/training_modes/README.md b/configs/debug/training_modes/README.md
@@ -9,9 +9,8 @@ Minimal end-to-end configs for the three training modes (`rl` / `opd` / `sft`) a
 | `opd_lora.toml` | `opd` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | trains a LoRA adapter (rank 8) |
 | `sft.toml` | `sft` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | |
 | `sft_lora.toml` | `sft` | local vLLM (`Qwen3-0.6B-Reverse-Text-RL`) | trains a LoRA adapter (rank 8) |
-| `sft_external.toml` | `sft` | PI inference (`openai/gpt-5-mini`) | external OAI endpoint; no local teacher |
 
-The student inference server is auto-launched on GPU 0 at `http://localhost:8000/v1` with `gpu_memory_utilization=0.5`. The local teacher (used by everything except `rl.toml` and `sft_external.toml`) is **not** auto-launched — start it manually on GPU 1.
+The student inference server is auto-launched on GPU 0 at `http://localhost:8000/v1` with `gpu_memory_utilization=0.5`. The local teacher (used by everything except `rl.toml`) is **not** auto-launched — start it manually on GPU 1.
 
 ## Start the local teacher
 
@@ -38,10 +37,6 @@ uv run rl @ configs/debug/training_modes/opd_lora.toml
 # SFT hard distill (needs teacher on port 8001)
 uv run rl @ configs/debug/training_modes/sft.toml
 uv run rl @ configs/debug/training_modes/sft_lora.toml
-
-# SFT hard distill from openai/gpt-5-mini via PI inference
-# (requires PRIME_API_KEY + PRIME_TEAM_ID in env; no local teacher needed)
-uv run rl @ configs/debug/training_modes/sft_external.toml
 ```
 
 See [docs/training.md](../../docs/training.md#training-modes-rl--opd--sft-via-orchestrator) for what each mode does.
diff --git a/configs/debug/training_modes/sft.toml b/configs/debug/training_modes/sft.toml
@@ -20,6 +20,10 @@ training_mode = "sft"
 batch_size = 128
 group_size = 4
 
+# The teacher rolls out over plain chat-completions (no tokens); the renderer backfills the missing tokens.
+[orchestrator.renderer]
+name = "qwen3"
+
 [orchestrator.train.sampling]
 max_completion_tokens = 128
 

diff --git a/configs/debug/training_modes/sft_external.toml b/configs/debug/training_modes/sft_external.toml
diff --git a/configs/debug/training_modes/sft_lora.toml b/configs/debug/training_modes/sft_lora.toml
@@ -20,6 +20,10 @@ training_mode = "sft"
 batch_size = 128
 group_size = 4
 
+# The teacher rolls out over plain chat-completions (no tokens); the renderer backfills the missing tokens.
+[orchestrator.renderer]
+name = "qwen3"
+
 [orchestrator.train.sampling]
 max_completion_tokens = 128
 

diff --git a/configs/debug/v1/alphabet_sort.toml b/configs/debug/v1/alphabet_sort.toml
@@ -0,0 +1,42 @@
+# v1 port of examples/alphabet_sort/rl.toml — identical except the env block, which loads
+# the v1 `alphabet-sort-v1` taskset (multi-turn via a colocated vf.User) instead of the v0
+# `primeintellect/alphabet-sort` env. Harness runs on the subprocess runtime.
+
+max_steps = 200
+seq_len = 2048
+
+[ckpt] # Checkpoint at the end of training
+
+[model]
+name = "Qwen/Qwen3-4B-Instruct-2507"
+
+[wandb]
+project = "alphabet-sort"
+name = "alphabet-sort"
+
+[trainer.model]
+impl = "auto"
+
+[trainer.model.ac]
+freq = 1
+
+[trainer.model.lora]
+rank = 32
+alpha = 64
+
+[trainer.optim]
+lr = 1e-5
+
+[orchestrator]
+batch_size = 512
+group_size = 8
+
+[orchestrator.train.sampling]
+max_completion_tokens = 768
+
+[[orchestrator.train.env]]
+name = "alphabet-sort"
+taskset = { id = "alphabet-sort-v1", min_turns = 3, max_turns = 5, power_per_turn = false }
+harness = { id = "default", enable_bash = false, runtime = { type = "subprocess" } }
+
+[inference] # Default inference config
diff --git a/configs/debug/v1/hendrycks_sanity.toml b/configs/debug/v1/hendrycks_sanity.toml
@@ -0,0 +1,52 @@
+# v1 analog of examples/hendrycks_sanity/rl.toml — identical config with only
+# the env sections swapped to v1 taskset/harness syntax (math-env taskset,
+# default harness, subprocess runtime). Submits to slurm by default; pass --no-slurm
+# to run locally.
+
+output_dir = "/beegfs/mika/hendrycks-sanity-v1-subprocess"
+max_steps = 5000
+
+[wandb]
+project = "hendrycks-sanity"
+name = "v1"
+
+[deployment]
+num_train_gpus = 4
+num_infer_gpus = 4
+
+[slurm]
+job_name = "v1-subprocess"
+partition = "cluster"
+
+[model]
+name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+
+[orchestrator]
+batch_size = 512
+group_size = 8
+seq_len = 8192
+
+[[orchestrator.train.env]]
+name = "hendrycks-math"
+taskset = { id = "math-env-v1", dataset_name = "mikasenghaas/Sanity-Test-R1D-1.5B", dataset_subset = "default" }
+harness = { id = "default", enable_bash = false, runtime = { type = "subprocess" } }
+
+[orchestrator.eval]
+interval = 50
+
+[[orchestrator.eval.env]]
+name = "aime2024"
+taskset = { id = "aime24-v1" }
+harness = { id = "default", enable_bash = false, runtime = { type = "subprocess" } }
+group_size = 32
+
+[trainer.model]
+seq_len = 16384
+
+[inference.model]
+max_model_len = 8192
+
+# Model not in MODEL_RENDERER_MAP — opt into DefaultRenderer (apply_chat_template).
+[orchestrator.renderer]
+name = "default"
+reasoning_parser = "think"
diff --git a/configs/debug/v1/r2e_gym.toml b/configs/debug/v1/r2e_gym.toml
@@ -0,0 +1,86 @@
+# v1 port of configs/rlm_swe/qwen35_4b.toml — identical except the env blocks, which load
+# the v1 `r2e-gym-v1` taskset through the rlm harness on the prime runtime instead of the
+# v0 `rlm_swe` composable env. Same model / trainer / inference / orchestrator knobs.
+
+output_dir = "/beegfs/mika/rlm-swe-qwen35-4b"
+max_steps = 400
+seq_len = 65536
+
+[slurm]
+job_name = "rlm-swe-qwen35-4b"
+project_dir = "."
+pre_run_command = "prime sandbox delete --label rlm-swe-qwen35-4b -y --plain || true"
+
+[deployment]
+type = "multi_node"
+num_train_nodes = 1
+num_infer_nodes = 1
+num_infer_replicas = 2
+
+[wandb]
+project = "rlm-swe-debug"
+name = "qwen35-4b"
+
+[weight_broadcast]
+type = "nccl"
+
+[ckpt]
+interval = 50
+keep_last = 1
+resume_step = -1
+
+[model]
+name = "Qwen/Qwen3.5-4B"
+
+# --- Trainer ---
+
+[trainer]
+
+[trainer.model]
+cp = 4
+cp_style = "ulysses"
+
+[trainer.model.ac]
+freq = 1
+
+[trainer.model.compile]
+
+# --- Orchestrator ---
+
+[orchestrator]
+batch_size = 256
+group_size = 8
+max_inflight_rollouts = 512
+max_off_policy_steps = 16
+
+# Thinking enabled for the Qwen3.5 renderer.
+[orchestrator.renderer]
+name = "qwen3.5"
+enable_thinking = true
+
+[orchestrator.train.sampling]
+temperature = 1.0
+
+[[orchestrator.train.env]]
+name = "rlm-swe-r2e"
+taskset = { id = "r2e-gym-v1" }
+harness = { id = "rlm", runtime = { type = "prime", labels = ["rlm-swe-qwen35-4b"] } }
+
+[orchestrator.prime_monitor]
+
+# --- Inference ---
+
+[inference]
+gpu_memory_utilization = 0.85
+enable_prefix_caching = true
+
+[inference.model]
+max_model_len = 65536
+
+[inference.parallel]
+dp = 8
+
+# Qwen3.5-4B is a VL model; skip the vision tower for text-only SWE.
+# `language_model_only` is a vLLM MultiModalConfig arg (no prime-rl field) → pass via vllm_extra.
+[inference.vllm_extra]
+language_model_only = true
diff --git a/configs/debug/v1/reverse_text.toml b/configs/debug/v1/reverse_text.toml
@@ -0,0 +1,42 @@
+# Debug RL run against the v1 env server (reverse-text starter).
+# The orchestrator spawns one v1 EnvServer per env (it never loads the env
+# itself), dispatches rollouts by task index, and trains on the returned Traces
+# (renderer-tokenized). Light settings for a quick end-to-end smoke test.
+
+max_steps = 20
+seq_len = 2048
+
+[wandb]
+project = "reverse-text"
+name = "v1"
+
+[model]
+name = "PrimeIntellect/Qwen3-0.6B-Reverse-Text-SFT"
+
+[orchestrator]
+training_mode = "rl"
+batch_size = 128
+group_size = 16
+
+[orchestrator.renderer]
+name = "qwen3"
+
+[orchestrator.train.sampling]
+max_completion_tokens = 128
+
+[[orchestrator.train.env]]
+taskset = { id = "reverse-text-v1" }
+# reverse-text is a pure-text single-turn env: disable the bash tool (the model answers
+# directly) and use the subprocess runtime (no docker).
+harness = { id = "default", enable_bash = false, runtime = { type = "subprocess" } }
+
+# No eval block, mirroring examples/reverse_text/rl.toml — this is a train-only
+# smoke test. Add an [orchestrator.eval] block (with an interval) to exercise eval.
+
+[trainer.optim]
+lr = 3e-6
+
+[ckpt]
+
+[inference]
+gpu_memory_utilization = 0.5