diff --git a/configs/debug/v1/multimodal.toml b/configs/debug/v1/multimodal.toml new file mode 100644 index 0000000000..d84fe0253c --- /dev/null +++ b/configs/debug/v1/multimodal.toml @@ -0,0 +1,76 @@ +# v1 port of configs/debug/multimodal.toml — identical except the env block, which loads +# the native v1 `color-codeword-v1` taskset (images delivered by a colocated vf.User) on the +# subprocess runtime, instead of the v0 `color-codeword` env. Exercises native v1 image +# input → renderer → multi_modal_data → Qwen3-VL training end-to-end. + +max_steps = 15 +seq_len = 4096 + +[model] +name = "Qwen/Qwen3-VL-4B-Instruct" + +[model.vlm] +vision_encoder_attr = "model.visual" +language_model_attr = "model.language_model" + +[deployment] +num_train_gpus = 1 +num_infer_gpus = 1 +gpus_per_node = 2 + +[orchestrator] +batch_size = 256 +group_size = 16 +# Image processor is CPU-bound and dominates for VLMs; returns diminish past 4. +pool_size = 4 + +# Step 0 on Qwen3-VL-4B vs color-codeword can be uniform (all-correct or +# all-wrong), so don't enforce zero-advantage dropping or training would crash +# before any progress. +[[orchestrator.post_batch_filters]] +type = "gibberish" + +[[orchestrator.post_batch_filters]] +type = "repetition" + +[[orchestrator.post_batch_filters]] +type = "zero_advantage" +enforce = false + +[orchestrator.train.sampling] +max_completion_tokens = 64 + +[[orchestrator.train.env]] +taskset = { id = "color-codeword-v1", images_per_turn = 1, max_turns = 3, num_examples = 1000, seed = 42 } +harness = { id = "default", enable_bash = false, runtime = { type = "subprocess" } } + +# Default renderer (AutoRendererConfig) resolves Qwen3-VL-4B-Instruct from +# MODEL_RENDERER_MAP to Qwen3VLRenderer at runtime; no explicit name needed. + +[trainer] + +[trainer.model] +optimization_dtype = "bfloat16" +reduce_dtype = "bfloat16" + +[trainer.optim] +lr = 3e-6 + +[inference] + +[inference.model] +# Workaround for vLLM 0.20.1 Qwen3-VL deepstack buffer bug: when num_scheduled_tokens +# (188) gets padded up to the next cudagraph_capture_size (192), the model's +# _set_deepstack_input_embeds sizes the buffer to 188 but forward() runs with 192, +# triggering "Requested more deepstack tokens than available in buffer". Eager mode +# skips the padding so num_input_tokens == num_scheduled_tokens. +enforce_eager = true + +[inference.parallel] +dp = 1 +tp = 1 + +[wandb] +project = "debug" +name = "multimodal-v1" +tags = ["qwen3vl-4b", "color-codeword-v1", "renderer", "v1"] diff --git a/deps/verifiers b/deps/verifiers index 907dac69a4..df6423c08b 160000 --- a/deps/verifiers +++ b/deps/verifiers @@ -1 +1 @@ -Subproject commit 907dac69a46ef8c5c67d71c4444169b05cf569b0 +Subproject commit df6423c08b5fdb740fa24a8a998af56f209d015f diff --git a/pyproject.toml b/pyproject.toml index 5b81b40da9..fc2393ce27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,6 +102,7 @@ envs = [ "gsm8k-v1", "math-env-v1", "aime24-v1", + "color-codeword-v1", "harnesses", ] disagg = [ @@ -246,6 +247,7 @@ reverse-text-v1 = { path = "deps/verifiers/examples/tasksets/reverse_text_v1", e gsm8k-v1 = { path = "deps/verifiers/examples/tasksets/gsm8k_v1", editable = true } math-env-v1 = { path = "deps/verifiers/examples/tasksets/math_env_v1", editable = true } aime24-v1 = { path = "deps/verifiers/examples/tasksets/aime24_v1", editable = true } +color-codeword-v1 = { path = "deps/verifiers/examples/tasksets/color_codeword_v1", editable = true } harnesses = { path = "deps/verifiers/packages/harnesses", editable = true } renderers = { path = "deps/renderers", editable = true } prime-pydantic-config = { path = "deps/pydantic-config", editable = true } diff --git a/src/prime_rl/orchestrator/orchestrator.py b/src/prime_rl/orchestrator/orchestrator.py index 94fd192784..f144b6422c 100644 --- a/src/prime_rl/orchestrator/orchestrator.py +++ b/src/prime_rl/orchestrator/orchestrator.py @@ -66,6 +66,7 @@ compute_teacher_logprobs, get_weight_dir, intercept_vf_logging, + mm_token_type_id_map, save_rollouts, set_default_executor, setup_student_inference_pool, @@ -215,11 +216,9 @@ async def setup(self) -> None: self.renderer, self.student_inference = await setup_student_inference_pool( config=config, tokenizer=self.tokenizer ) - self.mm_token_type_ids_mapping = ( - getattr(self.renderer, "mm_token_type_id_map", None) if self.renderer is not None else None - ) - if self.mm_token_type_ids_mapping == {}: - self.mm_token_type_ids_mapping = None + # The renderer lives in the env server, so derive the VLM type-id map from the + # renderer config directly (None for text models). + self.mm_token_type_ids_mapping = mm_token_type_id_map(config) if config.teacher is not None: get_logger().info( diff --git a/src/prime_rl/orchestrator/train_sink.py b/src/prime_rl/orchestrator/train_sink.py index 589c24524f..db264c50f0 100644 --- a/src/prime_rl/orchestrator/train_sink.py +++ b/src/prime_rl/orchestrator/train_sink.py @@ -149,7 +149,12 @@ async def process_rollout(self, rollout: TrainRollout) -> None: return if any(turn.tokens is None for turn in rollout.trace.trajectory): await asyncio.to_thread(backfill_rollout_tokens, rollout.trace, self.tokenizer) - samples = await asyncio.to_thread(trace_to_samples, rollout.trace, env_name=rollout.env_name) + samples = await asyncio.to_thread( + trace_to_samples, + rollout.trace, + env_name=rollout.env_name, + mm_token_type_ids_mapping=self.mm_token_type_ids_mapping, + ) rollout.samples = samples or [] def process_group(self, group_id: uuid.UUID) -> None: diff --git a/src/prime_rl/orchestrator/trajectories.py b/src/prime_rl/orchestrator/trajectories.py index 333054ae1b..21ca524e4c 100644 --- a/src/prime_rl/orchestrator/trajectories.py +++ b/src/prime_rl/orchestrator/trajectories.py @@ -14,6 +14,7 @@ from verifiers.v1.clients.openai import message_to_wire from prime_rl.transport import TrainingSample +from prime_rl.transport.types import EncodedTensor from prime_rl.utils.chat_template import ( common_prefix_len, deserialize_tool_calls, @@ -50,7 +51,57 @@ def backfill_rollout_tokens(trace: vf.Trace, tokenizer) -> None: ) -def trace_to_samples(trace: vf.Trace, *, env_name: str = "") -> list[TrainingSample]: +def _decode_wire_tensor(wt: vf.WireTensor): + import base64 + + import numpy as np + import torch + + arr = np.frombuffer(base64.b64decode(wt.data), dtype=np.dtype(wt.dtype)).reshape(wt.shape) + return torch.from_numpy(arr.copy()) + + +def _pack_mm_kwargs(mm_list: list[vf.MMData]) -> dict[str, EncodedTensor] | None: + """Union each turn's *new* images into model-agnostic `mm_kwargs`: concat each + HF-processor kwarg (e.g. `pixel_values`, `image_grid_thw`) in turn order. The model's + `forward` signature is the schema, so image/video keys don't clash. + + The stitched ids carry each image's placeholder tokens once (in the turn it's + introduced), so we contribute each image once too. A turn's `multi_modal_data` may be + *cumulative* (the renderer re-rendered the whole prompt → every image so far, native v1) + or *delta* (only the turn's new images, the v0 bridge); a turn is cumulative iff its + hashes restate everything taken so far, so we take only the appended tail. Identical + images in different turns (e.g. two squares of the same color) keep distinct slots — + matched by position, not deduped by hash.""" + import torch + + per_kwarg: dict[str, list] = {} + taken: dict[str, list] = {} # modality -> hashes contributed so far, in order + for mm in mm_list: + for modality, items in mm.mm_items.items(): + hashes = list(mm.mm_hashes.get(modality) or [None] * len(items)) + acc = taken.setdefault(modality, []) + if None not in hashes and hashes[: len(acc)] == acc: + start = len(acc) # cumulative turn: skip the restated prefix + acc[:] = hashes + else: + start = 0 # delta turn: all images are new + acc.extend(hashes) + for item in items[start:]: + for key, wt in item.items(): + per_kwarg.setdefault(key, []).append(_decode_wire_tensor(wt)) + if not per_kwarg: + return None + out: dict[str, EncodedTensor] = {} + for key, tensors in per_kwarg.items(): + arr = torch.cat(tensors, dim=0).contiguous().numpy() + out[key] = EncodedTensor(dtype=str(arr.dtype), shape=list(arr.shape), data=arr.tobytes()) + return out + + +def trace_to_samples( + trace: vf.Trace, *, env_name: str = "", mm_token_type_ids_mapping: dict[int, int] | None = None +) -> list[TrainingSample]: """Convert a v1 `Trace` into `TrainingSample`s — one per branch. Stitch each branch's turns into one token sequence: the branch's first-turn @@ -91,6 +142,13 @@ def trace_to_samples(trace: vf.Trace, *, env_name: str = "") -> list[TrainingSam if not completion_ids: continue + # Multimodal: union each turn's image/video tensors into mm_kwargs, and tag every + # token with its modality (image-placeholder vs text) via the renderer's map. + mm = [t.tokens.multi_modal_data for t in turns if t.tokens and t.tokens.multi_modal_data] + mm_kwargs = _pack_mm_kwargs(mm) if mm else None + mm_token_type_ids = None + if mm_kwargs is not None and mm_token_type_ids_mapping: + mm_token_type_ids = [mm_token_type_ids_mapping.get(tid, 0) for tid in prompt_ids + completion_ids] samples.append( TrainingSample( prompt_ids=prompt_ids, @@ -102,6 +160,8 @@ def trace_to_samples(trace: vf.Trace, *, env_name: str = "") -> list[TrainingSam teacher_logprobs=None, advantage=None, env_name=env_name, + mm_kwargs=mm_kwargs, + mm_token_type_ids=mm_token_type_ids, ) ) if not samples: diff --git a/src/prime_rl/orchestrator/utils.py b/src/prime_rl/orchestrator/utils.py index 5b0ff6ae1f..6b728ab57b 100644 --- a/src/prime_rl/orchestrator/utils.py +++ b/src/prime_rl/orchestrator/utils.py @@ -42,6 +42,19 @@ async def setup_student_inference_pool(*, config: OrchestratorConfig, tokenizer) return None, inference_pool +def mm_token_type_id_map(config: OrchestratorConfig) -> dict[int, int] | None: + """The image/video placeholder-token → type-id map the VLM trainer needs to build + `mm_token_type_ids` (M-RoPE). Built transiently from the renderer config — the + orchestrator keeps no renderer of its own, and a VLM always has one configured + (`vlm_requires_renderer`). None for a text model.""" + if not config.student.model.is_vlm or config.renderer is None: + return None + from renderers import create_renderer_pool + + pool = create_renderer_pool(config.student.model.name, config.renderer, size=1) + return getattr(pool, "mm_token_type_id_map", None) or None + + def get_model_completion_len(output: vf.Trace) -> int: """Sum of model-generated completion tokens across all turns (excludes environment-injected tokens between turns).""" diff --git a/uv.lock b/uv.lock index 166aedb371..a7b8b1b12f 100644 --- a/uv.lock +++ b/uv.lock @@ -568,6 +568,17 @@ requires-dist = [ { name = "verifiers", specifier = ">=0.1.10" }, ] +[[package]] +name = "color-codeword-v1" +version = "0.1.0" +source = { editable = "deps/verifiers/examples/tasksets/color_codeword_v1" } +dependencies = [ + { name = "pillow", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] + +[package.metadata] +requires-dist = [{ name = "pillow" }] + [[package]] name = "comm" version = "0.2.3" @@ -3599,6 +3610,7 @@ envs = [ { name = "alphabet-sort", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "code-env", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "color-codeword", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "color-codeword-v1", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "deepdive", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "general-agent", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, { name = "gpqa", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, @@ -3671,6 +3683,7 @@ requires-dist = [ { name = "beartype", specifier = ">=0.21.0" }, { name = "code-env", marker = "extra == 'envs'", editable = "deps/research-environments/environments/code_env" }, { name = "color-codeword", marker = "extra == 'envs'", editable = "deps/research-environments/environments/color_codeword" }, + { name = "color-codeword-v1", marker = "extra == 'envs'", editable = "deps/verifiers/examples/tasksets/color_codeword_v1" }, { name = "datasets", specifier = ">=4.0.0" }, { name = "deep-ep", marker = "platform_machine == 'x86_64' and extra == 'disagg'", url = "https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/deep_ep-1.2.1+29d31c0-cp312-cp312-linux_x86_64.whl" }, { name = "deep-gemm", marker = "platform_machine == 'x86_64' and extra == 'disagg'", url = "https://github.com/PrimeIntellect-ai/prime-rl/releases/download/v0.5.0/deep_gemm-2.5.0+891d57b-cp312-cp312-linux_x86_64.whl" },