PrimeIntellect-ai · hubert-marek · May 27, 2026 · May 27, 2026 · May 29, 2026 · May 29, 2026
diff --git a/Dockerfile.cuda b/Dockerfile.cuda
@@ -86,7 +86,11 @@ ARG TARGETARCH
 COPY scripts/docker-arm64-post-install.sh /app/scripts/docker-arm64-post-install.sh
 RUN if [ "$TARGETARCH" = "arm64" ]; then /app/scripts/docker-arm64-post-install.sh; fi
 
-FROM python:3.12-slim
+# Pin Debian 12 (bookworm, glibc 2.36) — the bare `python:3.12-slim` tag tracks
+# Debian stable, which moved to trixie (glibc ~2.41) and broke the runtime FLA
+# TileLang nvcc JIT (`bits/mathcalls.h: cospi/sinpi noexcept` conflict vs the
+# mounted CUDA host headers). bookworm's glibc stays close to the ubuntu22.04 builder's (glibc 2.35).
+FROM python:3.12-slim-bookworm
 
 RUN apt-get update && apt-get install -y \
     --no-install-recommends \

diff --git a/deps/renderers b/deps/renderers
diff --git a/deps/verifiers b/deps/verifiers
diff --git a/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py b/packages/prime-rl-configs/src/prime_rl/configs/orchestrator.py
@@ -588,6 +588,9 @@ def _preserve_mito_renderer(self, handler: SerializerFunctionWrapHandler) -> dic
     output_dir: Path = Path("outputs/run_default")
     """Directory to write outputs to — checkpoints, weights, rollouts, and logs are written as subdirectories. Should be a persistent directory with enough disk space and unique per experiment running on a single node."""
 
+    mm_artifact_ttl_seconds: float = 1800.0
+    """TTL (seconds) for offloaded multimodal ``mm_features`` artifacts under ``output_dir/assets/mm_features``. Once per step the orchestrator deletes feature files older than this. Features ONLY: source images under ``assets/images`` are never swept (they are terminal browser output with no regeneration path and are kept for the whole run as the recoverable source). Features are a regenerable cache (trainer rebuilds pixels from the image; env-worker rewrites missing features on demand), so over-eviction just forces a reprocess. The TTL only needs to exceed the write→vLLM-admit window (seconds), so minutes leave a large safety margin against racing in-flight reads. Defaults to 30 minutes."""
+
     tasks_per_minute: int | None = Field(None, ge=1)
     """Rate limit per environment worker, in tasks per minute. Recommended for sandbox-backed environments to prevent sandbox-not-ready errors during autoscaling. With multiple workers, the effective total rate is ``workers × this value``. None disables rate limiting."""
 
@@ -619,6 +622,9 @@ def _preserve_mito_renderer(self, handler: SerializerFunctionWrapHandler) -> dic
     max_off_policy_steps: int = Field(8, ge=0)
     """Maximum policies allowed to generate a single rollout. Rollouts generated more than ``max_off_policy_steps`` ahead of training are discarded. Higher values yield better throughput at the cost of off-policy noise."""
 
+    defer_mm_materialization: bool = True
+    """Defer multimodal pixel materialization to the trainer. When True, the orchestrator ships lightweight image references (``mm_refs``) instead of materializing pixels and shipping heavy ``mm_kwargs``. Must match the trainer's setting. A no-op for text-only runs; forced off for SFT."""
+
     bench: bool = False
     """Benchmark mode. Sets ``max_steps`` to 5 and disables W&B."""
 
@@ -760,6 +766,9 @@ def _force_no_renderer_for_sft(self):
         validators below so they see the corrected value."""
         if self.training_mode == "sft":
             self.renderer = None
+            # SFT has no renderer, so it can't defer materialization; keep the
+            # default-on flag from tripping the renderer-required validator.
+            self.defer_mm_materialization = False
         return self
 
     @model_validator(mode="after")
@@ -836,6 +845,19 @@ def validate_renderer_auto_resolves(self):
             f"client entirely (MITO)."
         )
 
+    @model_validator(mode="after")
+    def validate_defer_mm_materialization(self):
+        """Deferred materialization needs a renderer so the descriptor it ships
+        in ``mm_refs`` is reproducible by the trainer's identical renderer."""
+        # Only VLM runs emit mm_refs; text-only runs never do, so default-on is
+        # a harmless no-op for them even if the renderer is opted out.
+        if self.defer_mm_materialization and self.renderer is None and self.student.model.vlm is not None:
+            raise ValueError(
+                "orchestrator.defer_mm_materialization requires a renderer so the trainer can "
+                "materialize pixels identically from the shipped image references."
+            )
+        return self
+
     @model_validator(mode="after")
     def resolve_batching(self):
         has_rollout_batch = self.batch_size is not None

diff --git a/packages/prime-rl-configs/src/prime_rl/configs/shared.py b/packages/prime-rl-configs/src/prime_rl/configs/shared.py
@@ -249,16 +249,34 @@ class FileSystemTransportConfig(BaseTransportConfig):
 
 
 class ZMQTransportConfig(BaseTransportConfig):
+    """
+    ZMQ binds on all local interfaces and connects to ``host`` (or ``MASTER_ADDR`` when unset).
+    Base ``port`` is used for training batches if that hop uses ZMQ; micro-batches use
+    ``port + 1`` for PUB/SUB data and ``port + 2`` for the startup READY barrier.
+    This assumes a trusted trainer network; ZMQ messages are not authenticated.
+    """
+
     type: Literal["zmq"] = "zmq"
 
-    host: str = "localhost"
-    """Host address for ZMQ transport."""
+    host: str | None = None
+    """Host address receivers/senders connect to. When unset or ``0.0.0.0``, resolves to ``MASTER_ADDR`` or ``localhost``."""
 
     port: int = 5555
     """Base port for ZMQ transport."""
 
-    hwm: int = 10
+    hwm: int = Field(64, ge=1)
     """High-water mark (max in-flight messages per ZMQ socket)."""
 
+    recv_timeout_seconds: int = Field(300, ge=1)
+    """Seconds a micro-batch receiver waits after the master has published a step before failing fast."""
+
+    ready_timeout_seconds: int = Field(300, ge=1)
+    """Seconds the micro-batch sender waits at startup for rank READY messages before failing fast."""
+
+    publish_grace_ms: int = Field(1000, ge=0)
+    """One-time startup grace after all READY messages arrive, before the first publish, to let
+    PUB/SUB subscriptions propagate and avoid step-0 slow-joiner drops. Conservative by default
+    since it is a one-time cost; lower it once a topology is observed to start cleanly."""
+
 
 TransportConfig: TypeAlias = Annotated[FileSystemTransportConfig | ZMQTransportConfig, Field(discriminator="type")]
diff --git a/packages/prime-rl-configs/src/prime_rl/configs/trainer.py b/packages/prime-rl-configs/src/prime_rl/configs/trainer.py
@@ -3,6 +3,7 @@
 from typing import Annotated, Any, Literal, TypeAlias
 
 from pydantic import Field, model_validator
+from renderers import AutoRendererConfig, RendererConfig
 
 from prime_rl.configs.shared import (
     BaseModelConfig,
@@ -12,6 +13,7 @@
     TrainerLogConfig,
     TransportConfig,
     WandbConfig,
+    ZMQTransportConfig,
 )
 from prime_rl.utils.config import BaseConfig
 
@@ -522,6 +524,9 @@ class TrainerConfig(BaseConfig):
     rollout_transport: TransportConfig = FileSystemTransportConfig()
     """Transport used to ship rollouts from orchestrator to trainer."""
 
+    micro_batch_transport: TransportConfig = ZMQTransportConfig()
+    """Transport used to ship packed per-rank micro-batches from the trainer master to data ranks."""
+
     log: TrainerLogConfig = TrainerLogConfig()
 
     wandb: WandbConfig | None = None
@@ -562,6 +567,15 @@ class TrainerConfig(BaseConfig):
     max_concurrent_runs: int = Field(1, ge=1)
     """Maximum number of concurrent runs to allow. If 1, only one run may run at a time."""
 
+    defer_mm_materialization: bool = True
+    """Defer multimodal pixel materialization from the orchestrator to the trainer. When True, the orchestrator ships lightweight image references (``mm_refs``) and the trainer materializes pixels in its data loader. Must match the orchestrator's setting; requires ``renderer`` to be set for VLM runs. A no-op for text-only runs (no ``mm_refs`` ever arrive)."""
+
+    pack_multimodal: bool = True
+    """Pack multimodal samples together when the active model path supports packed multimodal position boundaries. Default-on, but the trainer gates it off for unsupported VLM/HF MRoPE paths, non-varlen attention, or context parallelism."""
+
+    renderer: RendererConfig | None = AutoRendererConfig()
+    """Typed renderer config (``renderers.RendererConfig`` discriminated union), mirroring the orchestrator's. Auto-resolves from the model by default so VLM defer runs work without restating it; only used by VLM runs (text-only ignores it)."""
+
     experimental: TrainerExperimentalConfig = TrainerExperimentalConfig()
 
     @model_validator(mode="after")
@@ -673,3 +687,22 @@ def router_replay_only_with_custom_impl(self):
             raise ValueError("Router replay is only supported with the custom implementation or auto mode")
 
         return self
+
+    @model_validator(mode="after")
+    def validate_defer_mm_materialization(self):
+        if not self.defer_mm_materialization:
+            return self
+        # Multi-run IS supported: synchronous trainer-side materialization is
+        # run-agnostic (all concurrent runs are LoRA adapters on the same base
+        # model → same image processor; mm_refs are self-contained per sample),
+        # and it does NOT touch the per-run ready_to_update/progress machinery in
+        # the packer. (A future prefetch/late-commit path WOULD need the multi-run
+        # ready_to_update state split — guard that there, not on the flag.)
+        # Only VLM runs materialize pixels; text-only runs never receive
+        # ``mm_refs``, so default-on is a harmless no-op for them.
+        if self.renderer is None and self.model.vlm is not None:
+            raise ValueError(
+                "defer_mm_materialization requires a renderer config so the trainer can "
+                "materialize pixels identically to the orchestrator. Set [renderer]."
+            )
+        return self
diff --git a/src/prime_rl/entrypoints/rl.py b/src/prime_rl/entrypoints/rl.py
@@ -197,19 +197,24 @@ def sigterm_handler(signum, frame):
         orchestrator_cmd = ["orchestrator", "@", (config_dir / ORCHESTRATOR_TOML).as_posix()]
         logger.info("Starting orchestrator process")
         logger.debug(f"Orchestrator start command: {' '.join(orchestrator_cmd)}")
+        from verifiers.utils.native_threads import native_thread_limited_env
+
+        orchestrator_env = native_thread_limited_env(
+            {
+                **os.environ,
+                **wandb_shared_env,
+                "WANDB_SHARED_LABEL": "orchestrator",
+                "LOGURU_FORCE_COLORS": "1",
+                "WANDB_PROGRAM": "uv run rl",
+                "WANDB_ARGS": json.dumps(start_command),
+            }
+        )
         with open(log_dir / "orchestrator.log", "w") as log_file:
             orchestrator_process = Popen(
                 orchestrator_cmd,
                 stdout=log_file,
                 stderr=log_file,
-                env={
-                    **os.environ,
-                    **wandb_shared_env,
-                    "WANDB_SHARED_LABEL": "orchestrator",
-                    "LOGURU_FORCE_COLORS": "1",
-                    "WANDB_PROGRAM": "uv run rl",
-                    "WANDB_ARGS": json.dumps(start_command),
-                },
+                env=orchestrator_env,
             )
         processes.append(orchestrator_process)
+4 −0		renderers/base.py
+336 −61		renderers/client.py
+82 −9		renderers/configs.py
+4 −3		renderers/kimi_k25.py
+347 −0		renderers/mm_store.py
+31 −20		renderers/qwen35.py
+477 −23		renderers/qwen3_vl.py
+350 −22		tests/test_client.py