Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
15d29ee
feat(dynamo): AdminAPI abstraction + dynamo backend selector + RL wor…
biswapanda Jun 6, 2026
a1e0c5f
feat(weight-transfer): NCCL broadcast + FP8/E8M0 conversion for GB200…
biswapanda Jun 6, 2026
685c13a
feat(broadcast): NFS-safe filesystem broadcast + weight_broadcast.kee…
biswapanda Jun 6, 2026
4f5ba0b
feat(orchestrator): dispatch compute_teacher_logprobs by renderer_tra…
biswapanda Jun 6, 2026
7e4bc48
feat(inference): vLLM 0.22 patches (fp32 lm-head, int64 silu_mul_quan…
biswapanda Jun 6, 2026
9f2d880
build(deps): point verifiers/renderers submodules at biswapanda rl-sd…
biswapanda Jun 6, 2026
a03a559
build(image): Dockerfile.cuda.runtime (vLLM 0.22, COPY deps, DeepGEMM…
biswapanda Jun 6, 2026
7643298
feat(deploy): helm chart updates + dynamo k8s manifests + smoke-test …
biswapanda Jun 6, 2026
49a4d1f
fix(rl): rename renderer_transport values to vllm_generate/dynamo_cha…
biswapanda Jun 9, 2026
e75f343
fix(routed_experts): carry dtype for models with over 256 experts
biswapanda Jun 10, 2026
cbc2792
feat(inference): forward moe_backend to vLLM so router-replay capture…
biswapanda Jun 10, 2026
788281c
fix(routed_experts): int32 fallback for >65535 experts, normalize uin…
biswapanda Jun 10, 2026
13e411d
fix(routed_experts): preserve per-model dtype so batch packing stays …
biswapanda Jun 10, 2026
2c61937
fix(k8s): drop trainer nvidia.com/gpu request when trainer DRA resour…
biswapanda Jun 10, 2026
1b5917a
fix(k8s): set backend=dynamo in the dynamo-deploy client example
biswapanda Jun 10, 2026
6f42560
chore(deps): bump verifiers/renderers submodules to rl-sdk-4 routed_e…
biswapanda Jun 11, 2026
f8f42f8
chore(deps): restore submodules and .gitmodules to upstream, dropping…
biswapanda Jun 11, 2026
8c837e6
rm extra files
biswapanda Jun 11, 2026
08bb4ea
rm unnecessary files
biswapanda Jun 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 189 additions & 0 deletions Dockerfile.cuda.runtime
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
# Multi-stage Dockerfile for prime-rl with NVRTC support on GB200 (sm_100a).
#
# WHY THIS EXISTS (separate from Dockerfile.cuda):
# The original Dockerfile.cuda uses `python:3.12-slim` as the runtime base.
# That image has no CUDA toolkit, so tilelang's JIT path (which compiles the
# sparse-MLA kernels at runtime via NVRTC) fails with:
#
# atomic.h(7): catastrophic error: cannot open source file "cuda/atomic"
#
# `cuda/atomic` is a libcudacxx (CCCL) header. It is shipped only by the
# CUDA dev/devel toolkit, not by the `nvidia-cuda-*` pip wheels. Without it
# NVRTC cannot compile any kernel that pulls in tilelang's atomic.h.
#
# This Dockerfile uses NVIDIA's cuda-dl-base (devel) image for both stages,
# so the runtime image carries the libcudacxx / CCCL headers tilelang needs
# and the wheels in /app/.venv keep their CUDA 12.x ABI.
#
# BASE IMAGE:
# nvcr.io/nvidia/cuda-dl-base:25.06-cuda12.9-devel-ubuntu24.04
# - matches dynamo/container/context.yaml `prime-rl.cuda12.9` entry
# - ships Python 3.12 by default (Ubuntu 24.04)
# - includes libcudacxx, CCCL, cuDNN, nvcc, and full CUDA dev headers
# - forward-compatible with the CUDA 12.8 wheels pinned in uv.lock
#
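# USAGE:
# docker buildx build --platform linux/arm64 \
# --build-arg TARGETARCH=arm64 \
# -f Dockerfile.cuda.runtime -t <tag> .
#
# NOTE: BuildKit sets TARGETARCH automatically from --platform; the explicit
# --build-arg is only needed with a builder that does not provide it.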

# Base image coordinates; both can be overridden with --build-arg. They are
# declared before the first FROM so that every stage can reference them.
ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base
ARG BASE_IMAGE_TAG=25.06-cuda12.9-devel-ubuntu24.04

############################
##### Build stage ##########
############################
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS builder
LABEL maintainer="prime intellect" \
      repository="prime-rl"

# Append LC_ALL=en_US.UTF-8 to /etc/environment.
RUN echo "LC_ALL=en_US.UTF-8" >> /etc/environment

# Pin the CUDA location explicitly so downstream tooling (tilelang,
# flash-attn) sees it, even though the base image already uses /usr/local/cuda.
ENV CUDA_HOME=/usr/local/cuda \
    PATH=$PATH:/usr/local/cuda/bin

# Build tooling, installed non-interactively; apt caches and temp dirs are
# removed in the same layer.
ARG DEBIAN_FRONTEND=noninteractive
ENV DEBIAN_FRONTEND=noninteractive \
    TZ=Etc/UTC
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      build-essential \
      curl \
      git \
      ninja-build \
      sudo \
 && apt-get clean autoclean \
 && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Download the uv installer, run it with /usr/local/bin as the install
# directory and PATH modification disabled, then delete the script.
ADD https://astral.sh/uv/install.sh /uv-installer.sh
RUN export INSTALLER_NO_MODIFY_PATH=1 UV_INSTALL_DIR="/usr/local/bin" \
 && sh /uv-installer.sh \
 && rm /uv-installer.sh

# Make uv reachable and keep its managed interpreters and cache under
# /usr/local/share/uv.
ENV PATH="/usr/local/bin:$PATH" \
    UV_PYTHON_INSTALL_DIR="/usr/local/share/uv/python" \
    UV_CACHE_DIR="/usr/local/share/uv/cache"

# Install Python dependencies. The project files are copied into /app first,
# then `uv sync` resolves strictly from uv.lock (--locked) without the dev
# dependencies (--no-dev).
WORKDIR /app

# Compile .pyc files at install time, and copy (rather than link) files out of
# the uv cache so the environment does not depend on the cache directory.
ENV UV_COMPILE_BYTECODE=1 UV_LINK_MODE=copy

COPY pyproject.toml /app/pyproject.toml
COPY uv.lock /app/uv.lock
COPY README.md /app/README.md
COPY src/ /app/src/
COPY packages/ /app/packages/
COPY deps/ /app/deps/
COPY configs /app/configs
COPY examples /app/examples
COPY benchmarks/scripts /app/benchmarks/scripts

# The cache mount has to target the directory uv actually writes to.
# UV_CACHE_DIR is /usr/local/share/uv/cache in this stage, so the previous
# target (/app/.cache/uv) was never used: nothing was reused between builds
# and the cache was written into the image layer instead.
RUN --mount=type=cache,target=/usr/local/share/uv/cache \
    uv sync \
      --extra flash-attn \
      --extra flash-attn-3 \
      --extra flash-attn-cute \
      --extra envs \
      --extra gpt-oss \
      --group mamba-ssm \
      --locked \
      --no-dev

# arm64 only: run the post-install script (builds flash-attn + DeepGEMM from
# source). Both helper scripts are copied on every architecture; the script is
# executed only when TARGETARCH is arm64.
ARG TARGETARCH
COPY scripts/docker-arm64-post-install.sh scripts/install_deep_gemm.sh /app/scripts/
RUN case "$TARGETARCH" in \
      arm64) /app/scripts/docker-arm64-post-install.sh ;; \
    esac

# vLLM PR #39366 (two-phase DP pause) is native in vLLM 0.22 — no patch needed
# (the rl-sdk-4 merge bumped vLLM 0.20.2 -> 0.22 and dropped this patch).

############################
##### Runtime stage ########
############################
# The runtime stage reuses the builder's base image so that libcudacxx, the
# CCCL headers and the system Python 3.12 are still present at runtime. The
# original Dockerfile.cuda switched to python:3.12-slim here and lost the CUDA
# dev headers, which broke tilelang's NVRTC backend.
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG}

ENV CUDA_HOME=/usr/local/cuda \
    PATH=/usr/local/cuda/bin:$PATH

ARG DEBIAN_FRONTEND=noninteractive
ENV DEBIAN_FRONTEND=noninteractive \
    TZ=Etc/UTC

# Runtime and debugging packages (listed alphabetically); apt caches and temp
# dirs are removed in the same layer.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      build-essential \
      clang \
      curl \
      git \
      git-lfs \
      gpg \
      ibverbs-providers \
      iperf \
      iputils-ping \
      libibverbs1 \
      net-tools \
      openssh-server \
      python3.12 \
      python3.12-venv \
      sudo \
      tmux \
      vim \
      wget \
 && apt-get clean autoclean \
 && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Point `python`, `python3` and `python3.12` in /usr/local/bin at the system
# 3.12 interpreter (ubuntu24.04 ships 3.12, but not these symlinks).
RUN for link in python python3 python3.12; do \
      ln -sf /usr/bin/python3.12 "/usr/local/bin/${link}" || exit 1; \
    done

# uid/gid of the unprivileged runtime account; overridable at build time.
ARG USER_ID=1000
ARG GROUP_ID=1000

# Ubuntu 24.04 ships a default `ubuntu` user at uid 1000. It is removed first
# (ignoring failure if absent) so the explicit appuser keeps uid/gid 1000,
# matching Dockerfile.cuda and the k8s manifests. appuser then gets
# passwordless sudo.
RUN set -e; \
    userdel -r ubuntu 2>/dev/null || true; \
    groupadd --gid $GROUP_ID appuser; \
    useradd --uid $USER_ID --gid appuser --create-home --shell /bin/bash appuser; \
    usermod -aG sudo appuser; \
    echo "appuser ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

# uv is also installed in the runtime image, for development use.
ADD https://astral.sh/uv/install.sh /uv-installer.sh
RUN export INSTALLER_NO_MODIFY_PATH=1 UV_INSTALL_DIR="/usr/local/bin" \
 && sh /uv-installer.sh \
 && rm /uv-installer.sh

USER appuser
ENV PATH="/usr/local/bin:$PATH"
WORKDIR /app

# Bring over the application tree, including /app/.venv, from the builder.
COPY --from=builder --chown=appuser:appuser /app /app

# Entrypoint script, made executable.
COPY --chown=appuser:appuser scripts/docker-entrypoint.sh /app/docker-entrypoint.sh
RUN chmod +x /app/docker-entrypoint.sh

# Re-point the venv's interpreter symlinks at the runtime-stage Python. The
# builder used a uv-managed Python that does not exist in this stage. The build
# fails if any of the three links is missing.
RUN set -e; \
    for shim in python python3 python3.12; do \
      rm "/app/.venv/bin/${shim}"; \
      ln -s /usr/bin/python3.12 "/app/.venv/bin/${shim}"; \
    done

# python3.12-dev provides the Python.h headers that vLLM's Triton CudaUtils
# JIT-compile needs at runtime. vLLM serving (dynamo.vllm / inference) requires
# them; without the package inference crashed with
# `fatal error: Python.h: No such file or directory`.
# apt needs root, so switch user for this one step and switch back afterwards.
USER root
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
      python3.12-dev \
 && apt-get clean autoclean \
 && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
USER appuser

# Executables from the project venv take precedence on PATH.
ENV PATH="/app/.venv/bin:$PATH"

# HuggingFace Hub timeouts (defaults are 10s which causes issues on slow networks).
ENV HF_HUB_ETAG_TIMEOUT=500 \
    HF_HUB_DOWNLOAD_TIMEOUT=300

# Enable FP8 grouped-GEMM kernels in vLLM MoE layers. This requires DeepGEMM,
# which the builder stage builds in its arm64 post-install step.
# NOTE(review): these flags are set for every architecture, but the builder
# only runs the post-install script when TARGETARCH=arm64 — confirm DeepGEMM
# is available on other architectures.
ENV VLLM_USE_DEEP_GEMM=1 \
    VLLM_MOE_USE_DEEP_GEMM=1

# The entrypoint performs setup (ulimit, etc); the default command just keeps
# the container alive so K8s can exec into it.
ENTRYPOINT ["/app/docker-entrypoint.sh"]
CMD ["sleep", "infinity"]
90 changes: 90 additions & 0 deletions k8s/dynamo-deploy/dynamo-dgd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Example DynamoGraphDeployment for serving inference to prime-rl.
#   kubectl apply -f dynamo-dgd.yaml -n <your-namespace>
#
# Replace `<your-namespace>`, `<your-registry>/dynamo:<vllm-runtime-tag>`,
# `<your-image-pull-secret>`, and the model name below for your environment.
#
# Requires DYN_ENABLE_RL=true on the Dynamo runtime so /v1/rl/* endpoints are
# served natively. The Frontend service below sets it explicitly so that the
# example works without relying on the image's defaults.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: prime-rl-dynamo
  namespace: <your-namespace>
spec:
  backendFramework: vllm
  # `create: false`: presumably a PVC named `model-cache` must already exist
  # in the namespace — confirm against the operator docs.
  pvcs:
    - create: false
      name: model-cache
  services:
    Frontend:
      componentType: frontend

      extraPodSpec:
        imagePullSecrets:
          - name: <your-image-pull-secret>
        mainContainer:
          env:
            # Turns on the /v1/rl/* admin routes that prime-rl's
            # `admin_base_url` points at.
            - name: DYN_ENABLE_RL
              value: "true"
          image: <your-registry>/dynamo:<vllm-runtime-tag>
          # Allows up to 360 * 10s = 1h for the frontend to become healthy.
          startupProbe:
            failureThreshold: 360
            httpGet:
              path: /health
              port: 8000
            periodSeconds: 10
            timeoutSeconds: 5
      replicas: 1
      volumeMounts:
        - mountPoint: /model-cache
          name: model-cache
    VllmWorker:
      componentType: worker

      # Presumably supplies the HuggingFace token from an existing secret —
      # verify the secret name for your cluster.
      envFromSecret: hf-token-secret
      extraPodSpec:
        imagePullSecrets:
          - name: <your-image-pull-secret>
        mainContainer:
          command:
            - python3
            - -m
            - dynamo.vllm
          args:
            - --model
            - Qwen/Qwen3-4B-Instruct-2507
            - --served-model-name
            - Qwen/Qwen3-4B-Instruct-2507
            - --tensor-parallel-size
            - "1"
            # Same value as `seq_len` in the prime-rl-configs ConfigMap.
            - --max-model-len
            - "2048"
            - --max-num-seqs
            - "64"
            - --gpu-memory-utilization
            - "0.90"
            - --enforce-eager
          # NOTE(review): if the worker process also reads DYN_ENABLE_RL, add
          # it here as well — confirm against the Dynamo runtime docs.
          env:
            # Keep the HuggingFace cache on the shared model-cache volume.
            - name: HF_HOME
              value: /model-cache/huggingface
          image: <your-registry>/dynamo:<vllm-runtime-tag>
          # Allows up to 360 * 10s = 1h for the worker to become healthy.
          startupProbe:
            failureThreshold: 360
            httpGet:
              path: /health
              port: 9090
            periodSeconds: 10
            timeoutSeconds: 10
          workingDir: /workspace/examples/backends/vllm
        tolerations:
          - effect: NoSchedule
            key: nvidia.com/gpu
            operator: Exists
      replicas: 1
      resources:
        limits:
          gpu: "1"
        requests:
          gpu: "1"
      sharedMemory:
        size: 16Gi
      volumeMounts:
        - mountPoint: /model-cache
          name: model-cache
60 changes: 60 additions & 0 deletions k8s/dynamo-deploy/prime-rl-configs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Example ConfigMap mounted at /configs in the orchestrator and trainer pods.
# kubectl apply -f prime-rl-configs.yaml -n <your-namespace>
#
# Replace `<your-namespace>` below with your namespace, and adjust the
# Dynamo frontend service hostname if it differs in your cluster.
#
# Two keys are provided: `orch.toml` and `train.toml` (presumably the
# orchestrator and trainer configs respectively — confirm against the pod specs).
# NOTE(review): the model name and seq_len in orch.toml equal the --model and
# --max-model-len values in dynamo-dgd.yaml; presumably they must be changed
# together — confirm.
# NOTE(review): both URLs in [client] embed the namespace and the
# `prime-rl-dynamo-frontend` service name on port 8000; verify the service name
# the operator actually creates for the `prime-rl-dynamo` deployment.
apiVersion: v1
kind: ConfigMap
metadata:
name: prime-rl-configs
namespace: <your-namespace>
data:
orch.toml: |
max_steps = 20
seq_len = 2048
batch_size = 64
rollouts_per_example = 4
use_token_client = false

[wandb]
project = "prime-rl-dynamo-k8s"
name = "dynamo-smoke-qwen3-4b"

[model]
name = "Qwen/Qwen3-4B-Instruct-2507"

[sampling]
max_tokens = 256

[[env]]
id = "math-env"
name = "hendrycks-math"
args = { dataset_name = "PrimeIntellect/Hendrycks-Math", dataset_subset = "default", math_verify_max_workers = 32, math_verify_timeout = 60 }

[buffer]
easy_threshold = 1.0
hard_threshold = 0.0

[client]
# Use the Dynamo admin/rollout transport (/engine/* + nvext), not the
# default vLLM frontend — required for the Dynamo /v1/rl/* admin routes below.
backend = "dynamo"
base_url = ["http://prime-rl-dynamo-frontend.<your-namespace>.svc.cluster.local:8000/v1"]
# Admin endpoints (/v1/rl/*) are served natively by the Dynamo Rust frontend
# (DYN_ENABLE_RL=true). No separate admin-stub service needed.
admin_base_url = ["http://prime-rl-dynamo-frontend.<your-namespace>.svc.cluster.local:8000/v1/rl"]
skip_model_check = true
Comment thread
biswapanda marked this conversation as resolved.

# Second ConfigMap key. Its max_steps, model name and seq_len carry the same
# values as orch.toml; presumably the two files must be kept in step — confirm.
train.toml: |
max_steps = 20

[model]
name = "Qwen/Qwen3-4B-Instruct-2507"
seq_len = 2048

[wandb]
project = "prime-rl-dynamo-k8s"
name = "dynamo-smoke-qwen3-4b-trainer"

[optim]
lr = 3e-6
Loading