Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
f32df98
feat(orchestrator): reconstruct mm pixels at training-sample build
eligotts May 27, 2026
1e89b7a
chore: bump renderers + verifiers pins to ephemeral-mm-pixels
eligotts May 27, 2026
7e67a08
chore(memory): cap native threads on orchestrator + env-worker spawn
eligotts May 29, 2026
c019044
feat(orchestrator): bound + trim multimodal pixel materialization; bu…
eligotts May 29, 2026
2bc548f
fix(orchestrator): resolve image-offload dir to absolute; bump deps/v…
codex May 29, 2026
9a57ecc
fix(orchestrator): free per-step pixel mm_kwargs; drop dead offload-d…
codex May 29, 2026
fa757d9
chore: bump deps/renderers + deps/verifiers to ephemeral-mm heads
eligotts May 30, 2026
9558f95
feat(mm): MMRefs transport + trainer-side deferred materialization + …
eligotts May 30, 2026
b0e1e4d
feat(mm): orchestrator ships mm_refs + canonical run-scoped offload/s…
eligotts May 30, 2026
be11a90
feat(mm): vLLM mmfile feature reader
eligotts May 30, 2026
c0ef88b
feat(mm): default deferred materialization + feature offload on (VLM-…
eligotts May 30, 2026
545c8c9
fix(mm): build trainer renderer whenever configured, not gated on mod…
eligotts May 31, 2026
64b33b0
fix(docker): pin runtime base to python:3.12-slim-bookworm (glibc 2.36)
eligotts May 31, 2026
dbdbeea
feat(mm): features-only artifact eviction (30m TTL) + last-use mtime
eligotts Jun 1, 2026
d859ac6
fix(qwen-vlm): raise a clear error on image token/feature mismatch
eligotts Jun 1, 2026
1b24bd7
test(orchestrator): assert step-back delta sample is self-contained
eligotts Jun 1, 2026
a2e89e3
feat(trainer): pack multimodal samples into microbatches
eligotts Jun 3, 2026
5bd07f7
feat(transport): default micro-batches to ZMQ with multi-node binding…
eligotts Jun 4, 2026
765fbb5
fix(monitor): inline offloaded images for platform sample upload
eligotts Jun 4, 2026
d9bac3b
chore(transport): raise default publish_grace_ms 100 -> 1000
eligotts Jun 4, 2026
f7a7fbc
fix(transport): remove generation-bound publish timeout from micro-ba…
eligotts Jun 4, 2026
52f4bd8
fix(monitor): stream multimodal sample uploads from disk
eligotts Jun 5, 2026
468a09a
chore(deps): bump renderers + verifiers to merged-main submodule commits
eligotts Jun 5, 2026
2f210f5
Merge remote-tracking branch 'origin/main' into feat/ephemeral-mm-pixels
eligotts Jun 5, 2026
e699871
chore(deps): bump verifiers to c33261b9
eligotts Jun 5, 2026
987d5ec
feat: vLLM serving accepts + materializes mmraw raw-image refs
eligotts Jun 9, 2026
7414cd7
fix(orchestrator): default train env num_workers to 32
eligotts Jun 9, 2026
4456197
fix(monitor): stream sample parquet to disk incrementally, drop VLM gate
eligotts Jun 9, 2026
b35bce8
Revert "fix(orchestrator): default train env num_workers to 32"
hubert-marek Jun 9, 2026
bcadf8c
fix(orchestrator): drop per-step prompt arrays from buffered rollouts…
hubert-marek Jun 10, 2026
35456a7
chore: bump renderers to 462149b (mmraw preprocessor_config hub-downl…
hubert-marek Jun 10, 2026
b101d22
explicit del and malloc (#2757)
hubert-marek Jun 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion Dockerfile.cuda
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,11 @@ ARG TARGETARCH
COPY scripts/docker-arm64-post-install.sh /app/scripts/docker-arm64-post-install.sh
RUN if [ "$TARGETARCH" = "arm64" ]; then /app/scripts/docker-arm64-post-install.sh; fi

FROM python:3.12-slim
# Pin Debian 12 (bookworm, glibc 2.36) — the bare `python:3.12-slim` tag tracks
# Debian stable, which moved to trixie (glibc ~2.41) and broke the runtime FLA
# TileLang nvcc JIT (`bits/mathcalls.h: cospi/sinpi noexcept` conflict vs the
# mounted CUDA host headers). bookworm's glibc matches the ubuntu22.04 builder.
FROM python:3.12-slim-bookworm

RUN apt-get update && apt-get install -y \
--no-install-recommends \
Expand Down
2 changes: 1 addition & 1 deletion deps/verifiers
Submodule verifiers updated 120 files
22 changes: 22 additions & 0 deletions packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,9 @@ def _preserve_mito_renderer(self, handler: SerializerFunctionWrapHandler) -> dic
output_dir: Path = Path("outputs/run_default")
"""Directory to write outputs to — checkpoints, weights, rollouts, and logs are written as subdirectories. Should be a persistent directory with enough disk space and unique per experiment running on a single node."""

mm_artifact_ttl_seconds: float = 1800.0
"""TTL (seconds) for offloaded multimodal ``mm_features`` artifacts under ``output_dir/assets/mm_features``. Once per step the orchestrator deletes feature files older than this. Features ONLY: source images under ``assets/images`` are never swept (they are terminal browser output with no regeneration path and are kept for the whole run as the recoverable source). Features are a regenerable cache (trainer rebuilds pixels from the image; env-worker rewrites missing features on demand), so over-eviction just forces a reprocess. The TTL only needs to exceed the write→vLLM-admit window (seconds), so minutes leave a large safety margin against racing in-flight reads. Defaults to 30 minutes."""

tasks_per_minute: int | None = Field(None, ge=1)
"""Rate limit per environment worker, in tasks per minute. Recommended for sandbox-backed environments to prevent sandbox-not-ready errors during autoscaling. With multiple workers, the effective total rate is ``workers × this value``. None disables rate limiting."""

Expand Down Expand Up @@ -619,6 +622,9 @@ def _preserve_mito_renderer(self, handler: SerializerFunctionWrapHandler) -> dic
max_off_policy_steps: int = Field(8, ge=0)
"""Maximum policies allowed to generate a single rollout. Rollouts generated more than ``max_off_policy_steps`` ahead of training are discarded. Higher values yield better throughput at the cost of off-policy noise."""

defer_mm_materialization: bool = True
"""Defer multimodal pixel materialization to the trainer. When True, the orchestrator ships lightweight image references (``mm_refs``) instead of materializing pixels and shipping heavy ``mm_kwargs``. Must match the trainer's setting. A no-op for text-only runs; forced off for SFT."""

bench: bool = False
"""Benchmark mode. Sets ``max_steps`` to 5 and disables W&B."""

Expand Down Expand Up @@ -760,6 +766,9 @@ def _force_no_renderer_for_sft(self):
validators below so they see the corrected value."""
if self.training_mode == "sft":
self.renderer = None
# SFT has no renderer, so it can't defer materialization; keep the
# default-on flag from tripping the renderer-required validator.
self.defer_mm_materialization = False
return self

@model_validator(mode="after")
Expand Down Expand Up @@ -836,6 +845,19 @@ def validate_renderer_auto_resolves(self):
f"client entirely (MITO)."
)

@model_validator(mode="after")
def validate_defer_mm_materialization(self):
    """Reject deferred multimodal materialization on a VLM run without a renderer.

    Deferred materialization ships image references (``mm_refs``) whose
    descriptor must be reproducible by the trainer's identical renderer, so a
    renderer is required whenever the student model is a VLM.

    Returns:
        ``self``, unchanged, when the configuration is acceptable.

    Raises:
        ValueError: if deferral is enabled, no renderer is configured, and the
            student model has a ``vlm`` section.
    """
    # Evaluated left to right so the VLM lookup only happens when deferral is
    # on and the renderer is actually missing.
    renderer_missing = self.defer_mm_materialization and self.renderer is None
    # Text-only runs never emit mm_refs, so the default-on flag stays a
    # harmless no-op for them even when the renderer is opted out.
    if renderer_missing and self.student.model.vlm is not None:
        message = (
            "orchestrator.defer_mm_materialization requires a renderer so the trainer can "
            "materialize pixels identically from the shipped image references."
        )
        raise ValueError(message)
    return self

@model_validator(mode="after")
def resolve_batching(self):
has_rollout_batch = self.batch_size is not None
Expand Down
24 changes: 21 additions & 3 deletions packages/prime-rl-configs/src/prime_rl/configs/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,16 +249,34 @@ class FileSystemTransportConfig(BaseTransportConfig):


class ZMQTransportConfig(BaseTransportConfig):
    """
    ZMQ binds on all local interfaces and connects to ``host``; when ``host`` is unset or
    ``0.0.0.0``, the connect address resolves to ``MASTER_ADDR`` or ``localhost``.
    Base ``port`` is used for training batches if that hop uses ZMQ; micro-batches use
    ``port + 1`` for PUB/SUB data and ``port + 2`` for the startup READY barrier.
    This assumes a trusted trainer network; ZMQ messages are not authenticated.
    """

    type: Literal["zmq"] = "zmq"

    # NOTE: the superseded ``host: str = "localhost"`` and ``hwm: int = 10``
    # declarations were dropped; each field is now declared exactly once, with
    # the value that previously won by being assigned last.
    host: str | None = None
    """Host address receivers/senders connect to. When unset or ``0.0.0.0``, resolves to ``MASTER_ADDR`` or ``localhost``."""

    port: int = 5555
    """Base port for ZMQ transport."""

    hwm: int = Field(64, ge=1)
    """High-water mark (max in-flight messages per ZMQ socket)."""

    recv_timeout_seconds: int = Field(300, ge=1)
    """Seconds a micro-batch receiver waits after the master has published a step before failing fast."""

    ready_timeout_seconds: int = Field(300, ge=1)
    """Seconds the micro-batch sender waits at startup for rank READY messages before failing fast."""

    publish_grace_ms: int = Field(1000, ge=0)
    """One-time startup grace after all READY messages arrive, before the first publish, to let
    PUB/SUB subscriptions propagate and avoid step-0 slow-joiner drops. Conservative by default
    since it is a one-time cost; lower it once a topology is observed to start cleanly."""


TransportConfig: TypeAlias = Annotated[FileSystemTransportConfig | ZMQTransportConfig, Field(discriminator="type")]
33 changes: 33 additions & 0 deletions packages/prime-rl-configs/src/prime_rl/configs/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Annotated, Any, Literal, TypeAlias

from pydantic import Field, model_validator
from renderers import AutoRendererConfig, RendererConfig

from prime_rl.configs.shared import (
BaseModelConfig,
Expand All @@ -12,6 +13,7 @@
TrainerLogConfig,
TransportConfig,
WandbConfig,
ZMQTransportConfig,
)
from prime_rl.utils.config import BaseConfig

Expand Down Expand Up @@ -522,6 +524,9 @@ class TrainerConfig(BaseConfig):
rollout_transport: TransportConfig = FileSystemTransportConfig()
"""Transport used to ship rollouts from orchestrator to trainer."""

micro_batch_transport: TransportConfig = ZMQTransportConfig()
"""Transport used to ship packed per-rank micro-batches from the trainer master to data ranks."""

log: TrainerLogConfig = TrainerLogConfig()

wandb: WandbConfig | None = None
Expand Down Expand Up @@ -562,6 +567,15 @@ class TrainerConfig(BaseConfig):
max_concurrent_runs: int = Field(1, ge=1)
"""Maximum number of concurrent runs to allow. If 1, only one run may run at a time."""

defer_mm_materialization: bool = True
"""Defer multimodal pixel materialization from the orchestrator to the trainer. When True, the orchestrator ships lightweight image references (``mm_refs``) and the trainer materializes pixels in its data loader. Must match the orchestrator's setting; requires ``renderer`` to be set for VLM runs. A no-op for text-only runs (no ``mm_refs`` ever arrive)."""

pack_multimodal: bool = True
"""Pack multimodal samples together when the active model path supports packed multimodal position boundaries. Default-on, but the trainer gates it off for unsupported VLM/HF MRoPE paths, non-varlen attention, or context parallelism."""

renderer: RendererConfig | None = AutoRendererConfig()
"""Typed renderer config (``renderers.RendererConfig`` discriminated union), mirroring the orchestrator's. Auto-resolves from the model by default so VLM defer runs work without restating it; only used by VLM runs (text-only ignores it)."""

experimental: TrainerExperimentalConfig = TrainerExperimentalConfig()

@model_validator(mode="after")
Expand Down Expand Up @@ -673,3 +687,22 @@ def router_replay_only_with_custom_impl(self):
raise ValueError("Router replay is only supported with the custom implementation or auto mode")

return self

@model_validator(mode="after")
def validate_defer_mm_materialization(self):
    """Require a renderer config when a VLM run defers pixel materialization.

    Returns:
        ``self``, unchanged, when the configuration is acceptable.

    Raises:
        ValueError: if deferral is enabled, ``renderer`` is ``None``, and the
            model has a ``vlm`` section.
    """
    # Multi-run is deliberately NOT rejected here: synchronous trainer-side
    # materialization is run-agnostic (concurrent runs are LoRA adapters on
    # the same base model, hence the same image processor, and mm_refs are
    # self-contained per sample) and does not touch the packer's per-run
    # ready_to_update/progress machinery. A future prefetch/late-commit path
    # would need that state split -- guard it there, not on this flag.
    #
    # Text-only runs never receive ``mm_refs``, so leaving the flag at its
    # default-on value is a harmless no-op for them; only VLM runs are checked.
    deferring_without_renderer = self.defer_mm_materialization and self.renderer is None
    if deferring_without_renderer and self.model.vlm is not None:
        message = (
            "defer_mm_materialization requires a renderer config so the trainer can "
            "materialize pixels identically to the orchestrator. Set [renderer]."
        )
        raise ValueError(message)
    return self
21 changes: 13 additions & 8 deletions src/prime_rl/entrypoints/rl.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,19 +197,24 @@ def sigterm_handler(signum, frame):
orchestrator_cmd = ["orchestrator", "@", (config_dir / ORCHESTRATOR_TOML).as_posix()]
logger.info("Starting orchestrator process")
logger.debug(f"Orchestrator start command: {' '.join(orchestrator_cmd)}")
from verifiers.utils.native_threads import native_thread_limited_env

orchestrator_env = native_thread_limited_env(
{
**os.environ,
**wandb_shared_env,
"WANDB_SHARED_LABEL": "orchestrator",
"LOGURU_FORCE_COLORS": "1",
"WANDB_PROGRAM": "uv run rl",
"WANDB_ARGS": json.dumps(start_command),
}
)
with open(log_dir / "orchestrator.log", "w") as log_file:
orchestrator_process = Popen(
orchestrator_cmd,
stdout=log_file,
stderr=log_file,
env={
**os.environ,
**wandb_shared_env,
"WANDB_SHARED_LABEL": "orchestrator",
"LOGURU_FORCE_COLORS": "1",
"WANDB_PROGRAM": "uv run rl",
"WANDB_ARGS": json.dumps(start_command),
},
env=orchestrator_env,
)
processes.append(orchestrator_process)

Expand Down
Loading