diff --git a/.github/workflows/build-podvm-cohere.yaml b/.github/workflows/build-podvm-cohere.yaml index dfabc84e2e..6c5f23b7fc 100644 --- a/.github/workflows/build-podvm-cohere.yaml +++ b/.github/workflows/build-podvm-cohere.yaml @@ -22,10 +22,20 @@ on: type: string default: "https://github.com/cohere-ai/guest-components.git" guest_components_ref: - description: "guest-components ref (default: cohere)" + description: | + guest-components ref (branch, tag, or SHA). + + Default: alhassankhedr/sync-main-to-cohere (head of PR #9). + That branch carries upstream main's nvidia-attester rewrite + (NVAT SDK based, no `count == 1` guard) and is required for + multi-GPU evidence to work end-to-end on 8x B200 hosts. The + plain `cohere` branch still has the old NVML-based attester + which silently produces empty evidence on 2+ GPU systems + (mod a sed `s/count == 1/count >= 1/` patch the podvm-mkosi + Dockerfile applies). Switch back to `cohere` after PR #9 merges. required: false type: string - default: "cohere" + default: "alhassankhedr/sync-main-to-cohere" custom_gc_binaries: description: "guest-components binaries to build from source" required: false @@ -46,6 +56,11 @@ on: required: false type: boolean default: false + b200_cc_drivers: + description: "Install NVIDIA 595.71.05 open driver (enables Confidential Computing on multi-GPU B200). 
EXPERIMENTAL" + required: false + type: boolean + default: false permissions: id-token: write # OIDC token for build provenance attestation @@ -70,11 +85,13 @@ jobs: image_name_debug: ${{ steps.compute.outputs.image_name_debug }} image_tag_release: ${{ steps.compute.outputs.image_tag_release }} image_tag_debug: ${{ steps.compute.outputs.image_tag_debug }} + b200_cc_drivers: ${{ steps.compute.outputs.b200_cc_drivers }} steps: - name: Compute tags and image names id: compute env: DISTRO: ${{ inputs.distro || 'ubuntu' }} + B200_CC_DRIVERS: ${{ inputs.b200_cc_drivers && 'true' || 'false' }} run: | if [[ "$GITHUB_REF" == refs/tags/podvm-v* ]]; then TAG="${GITHUB_REF#refs/tags/podvm-}" @@ -84,10 +101,15 @@ jobs: REPLACE_IMAGE="true" fi TAG="${TAG//./-}" + # Suffix CC-driver builds so they never collide with standard images + if [ "$B200_CC_DRIVERS" = "true" ]; then + TAG="${TAG}-cc595" + fi { echo "tag=$TAG" echo "distro=$DISTRO" echo "replace_image=$REPLACE_IMAGE" + echo "b200_cc_drivers=$B200_CC_DRIVERS" echo "image_name_release=podvm-${DISTRO}-${TEE_PLATFORM}-release-${TAG}" echo "image_name_debug=podvm-${DISTRO}-${TEE_PLATFORM}-debug-${TAG}" echo "image_tag_release=${TAG}-${DISTRO}-release" @@ -176,7 +198,7 @@ jobs: PODVM_DISTRO: ${{ needs.meta.outputs.distro }} AA_FEATURES: ${{ inputs.aa_features || 'bin,ttrpc,kbs,coco_as,rust-crypto,tdx-attester,nvidia-attester' }} GC_REPO: ${{ inputs.guest_components_repo || 'https://github.com/cohere-ai/guest-components.git' }} - GC_REF: ${{ inputs.guest_components_ref || 'cohere' }} + GC_REF: ${{ inputs.guest_components_ref || 'alhassankhedr/sync-main-to-cohere' }} GC_CUSTOM_BINARIES: ${{ inputs.custom_gc_binaries || 'attestation-agent,api-server-rest' }} run: | MAKE_ARGS=( @@ -195,6 +217,47 @@ jobs: echo "Disk after binaries build:" df -h / + - name: Override NVIDIA driver to 595.71.05 (B200 multi-GPU CC) + if: needs.meta.outputs.b200_cc_drivers == 'true' + working-directory: src/cloud-api-adaptor/podvm-mkosi + run: | + set 
-euo pipefail + CONF=mkosi.presets/system/mkosi.conf.d/ubuntu.conf + # The 595 branch only ships the unversioned `nvidia-driver-open` + # metapackage in NVIDIA's CUDA repo (no `nvidia-driver-595-open`). + # Match the driver pin with any (or no) branch suffix so this survives base-pin bumps beyond 580.x.y and an already-unversioned pin. + sed -i -E \ + -e 's|^([[:space:]]*)nvidia-driver(-[0-9]+)?-open=.*|\1nvidia-driver-open=595.71.05-1ubuntu1|' \ + -e 's|^([[:space:]]*)nvidia-persistenced=.*|\1nvidia-persistenced=595.71.05-1ubuntu1|' \ + -e 's|^([[:space:]]*)nvidia-fabricmanager=.*|\1nvidia-fabricmanager=595.71.05-1ubuntu1|' \ + -e 's|^([[:space:]]*)libnvidia-nscq=.*|\1libnvidia-nscq=595.71.05-1ubuntu1|' \ + "$CONF" + echo "----- Updated NVIDIA package pins -----" + grep -E '^[[:space:]]*(nvidia|libnvidia)' "$CONF" + + - name: Increase debug root partition for CC595 drivers + if: needs.meta.outputs.b200_cc_drivers == 'true' && matrix.profile == 'debug' + working-directory: src/cloud-api-adaptor/podvm-mkosi + run: | + set -euo pipefail + CONF=mkosi.presets/system/mkosi.repart-debug/10-root.conf + # NVIDIA 595 drivers make the root filesystem too large for + # systemd-repart's Minimize=guess estimation, causing mkfs.ext4 + # "No space left on device" during the build. 
+ printf '[Partition]\nType=root\nFormat=ext4\nCopyFiles=/\nMinimize=off\nSizeMinBytes=12G\nSizeMaxBytes=12G\n' > "$CONF" + echo "----- Updated repart config -----" + cat "$CONF" + + - name: Resolve installed NVIDIA driver version + working-directory: src/cloud-api-adaptor/podvm-mkosi + run: | + set -euo pipefail + CONF=mkosi.presets/system/mkosi.conf.d/ubuntu.conf + DRIVER_LINE=$(grep -E '^[[:space:]]*nvidia-driver(-[0-9]+)?-open=' "$CONF" | head -n1 || true) + DRIVER_VER=$(printf '%s' "$DRIVER_LINE" | sed -nE 's|.*=([0-9]+\.[0-9]+\.[0-9]+).*|\1|p') + echo "Resolved NVIDIA driver version: ${DRIVER_VER:?no X.Y.Z nvidia-driver-open pin found in $CONF}" + echo "NVIDIA_DRIVER=$DRIVER_VER" >> "$GITHUB_ENV" + - name: Build OS image working-directory: src/cloud-api-adaptor/podvm-mkosi env: @@ -265,6 +328,7 @@ jobs: --arg distro "$DISTRO" \ --arg profile "$PROFILE" \ --arg tee_platform "$TEE_PLATFORM" \ + --arg nvidia_driver "$NVIDIA_DRIVER" \ --arg build_date "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ '$ARGS.named' > /tmp/measurements.json @@ -307,6 +371,7 @@ jobs: --annotation "com.cohere.caa.commit=${CAA_COMMIT}" \ --annotation "com.cohere.caa.version=${GITHUB_REF_NAME}" \ --annotation "com.cohere.rtmr2=${RTMR2}" \ + --annotation "com.cohere.nvidia.driver=${NVIDIA_DRIVER}" \ --format json > oras-output.json cat oras-output.json diff --git a/src/cloud-api-adaptor/podvm-mkosi/Dockerfile.mkosi.ubuntu b/src/cloud-api-adaptor/podvm-mkosi/Dockerfile.mkosi.ubuntu index ee25af5f79..22ab71a337 100644 --- a/src/cloud-api-adaptor/podvm-mkosi/Dockerfile.mkosi.ubuntu +++ b/src/cloud-api-adaptor/podvm-mkosi/Dockerfile.mkosi.ubuntu @@ -51,6 +51,25 @@ COPY mkosi.conf.ubuntu /image/mkosi.conf # Add NVIDIA APT repos to mkosi.skeleton/ so they are present in the image tree # *before* package installation. mkosi applies SkeletonTrees before apt-get runs. +# +# Three NVIDIA repos are wired in: +# 1. 
CUDA repo (developer.download.nvidia.com/compute/cuda) +# Provides nvidia-driver-open, nvidia-fabricmanager, libnvidia-nscq, +# nvidia-imex, libnvsdm, nvlsm, collectx-bringup, mft, mft-oem, +# mft-autocomplete, etc. — the full R595 + nvlink5 stack EXCEPT for +# the `ucx` dependency that collectx-bringup pulls in. +# 2. nvidia-container-toolkit repo (nvidia.github.io/libnvidia-container) +# Provides nvidia-container-toolkit and friends. +# 3. NVIDIA DOCA-Host networking repo (linux.mellanox.com/public/repo/doca) +# Provides `ucx` (Unified Communication X) and the matching MOFED +# userspace stack. We need it ONLY for `ucx` (so collectx-bringup's +# Depends: ucx resolves) and as an alternate source for mft*/ +# collectx-bringup. The repo is pinned at priority 100 so it acts as +# a fallback — packages already available from the cuda repo or from +# Ubuntu universe (e.g. rdma-core, ibverbs-utils, libibumad3) are +# NOT replaced by the MOFED variants, which would otherwise turn the +# image into a MOFED-userspace install. Only ucx (which exists ONLY +# in DOCA) and any explicitly-requested package fall through to it. 
RUN mkdir -p /image/mkosi.skeleton/etc/apt/sources.list.d \ /image/mkosi.skeleton/etc/apt/preferences.d \ /image/mkosi.skeleton/usr/share/keyrings \ @@ -58,10 +77,14 @@ RUN mkdir -p /image/mkosi.skeleton/etc/apt/sources.list.d \ -o /image/mkosi.skeleton/usr/share/keyrings/cuda-archive-keyring.gpg \ && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ | gpg --dearmor -o /image/mkosi.skeleton/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -fsSL https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox \ + | gpg --dearmor -o /image/mkosi.skeleton/usr/share/keyrings/nvidia-doca-keyring.gpg \ && echo 'deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/ /' \ > /image/mkosi.skeleton/etc/apt/sources.list.d/cuda-ubuntu2404-x86_64.list \ && echo 'deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://nvidia.github.io/libnvidia-container/stable/deb/amd64 /' \ > /image/mkosi.skeleton/etc/apt/sources.list.d/nvidia-container-toolkit.list \ + && echo 'deb [signed-by=/usr/share/keyrings/nvidia-doca-keyring.gpg] https://linux.mellanox.com/public/repo/doca/latest/ubuntu24.04/x86_64/ ./' \ + > /image/mkosi.skeleton/etc/apt/sources.list.d/nvidia-doca-ubuntu2404.list \ && printf '%s\n' \ 'Package: nvidia-*' \ 'Pin: origin developer.download.nvidia.com' \ @@ -74,7 +97,24 @@ RUN mkdir -p /image/mkosi.skeleton/etc/apt/sources.list.d \ 'Package: cuda-*' \ 'Pin: origin developer.download.nvidia.com' \ 'Pin-Priority: 1001' \ - > /image/mkosi.skeleton/etc/apt/preferences.d/nvidia-cuda-repo + > /image/mkosi.skeleton/etc/apt/preferences.d/nvidia-cuda-repo \ + && printf '%s\n' \ + '# Pin the NVIDIA DOCA-Host repo to priority 100. Apt rule for' \ + '# "Pin-Priority: 100": install ONLY if explicitly requested or as' \ + '# a dep, never auto-upgrade or replace a package available from' \ + '# a higher-priority source. 
Outcome:' \ + '# - ucx (only in DOCA) -> installs from DOCA ✓ (resolves' \ + '# collectx-bringup Depends: ucx)' \ + '# - rdma-core, ibverbs-utils, libibumad3 (in universe @500 AND' \ + '# in DOCA @100) -> installs from universe ✓' \ + '# (keeps inbox OFED, not MOFED)' \ + '# - collectx-bringup, mft* -> install from cuda repo (origin' \ + '# developer.download.nvidia.com,' \ + '# default priority 500 > 100)' \ + 'Package: *' \ + 'Pin: origin linux.mellanox.com' \ + 'Pin-Priority: 100' \ + > /image/mkosi.skeleton/etc/apt/preferences.d/nvidia-doca-repo RUN --security=insecure mkosi --profile=$PROFILE --image-version=$IMAGE_VERSION diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.postinst b/src/cloud-api-adaptor/podvm-mkosi/mkosi.postinst index 6dc3beb46f..4c93012367 100755 --- a/src/cloud-api-adaptor/podvm-mkosi/mkosi.postinst +++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.postinst @@ -2,6 +2,56 @@ set -euxo pipefail +# --------------------------------------------------------------------------- +# Distro guard: this image MUST be Ubuntu noble. +# +# The Makefile default is `PODVM_DISTRO ?= fedora`, so a bare +# `make image-debug` silently builds a Fedora qcow2. The Cohere B200 SVM +# stack assumes Ubuntu noble end-to-end: +# - mkosi.presets/system/mkosi.conf.d/ubuntu.conf pins 13 NVIDIA packages +# (nvidia-driver-open=595.71.05-1ubuntu1, nvidia-fabricmanager*, +# libnvsdm, nvidia-imex, libnvidia-nscq, nvlsm, collectx-bringup, mft*, +# rdma-core, ibverbs-utils, libibumad3, datacenter-gpu-manager-4-cuda12, +# nvidia-modprobe, nvidia-container-toolkit) from the CUDA + DOCA-Host +# apt repos. NONE of these resolve in Fedora's package universe. +# - service-vm/scripts/svc-vm-bootstrap.sh runtime install workaround +# uses `apt-get download` + `dpkg -x` (no rpm equivalent here). +# - service-vm/scripts/verify-svc-vm.sh hard-fail gates use dpkg-query, +# ldconfig with /lib/x86_64-linux-gnu/, and apt-style version strings. 
+# - The fmctl-probe bake below links against +# /usr/lib/x86_64-linux-gnu/libnvfm.so from nvidia-fabricmanager-dev, +# which does not exist in Fedora's NVIDIA repos. +# +# A Fedora qcow2 produced by this pipeline would silently lack every B200 +# component and the SVM would only fail at FM startup time -- AFTER the +# operator has copied a 5 GiB image into place and rebooted. Fail here +# instead, before mkosi finalizes the disk image. +# --------------------------------------------------------------------------- +osr="${BUILDROOT}/etc/os-release" +if [ -f "$osr" ]; then + distro_id=$(awk -F= '$1=="ID" {gsub(/"/,"",$2); print $2; exit}' "$osr") + if [ "$distro_id" != "ubuntu" ]; then + cat >&2 <partition-id resolver. +# Source is vendored at mkosi.skeleton/usr/src/fmctl-probe/fmctl-probe.cpp +# (canonical copy lives in fortress: scratch/oci-b200/.../orchestration/scripts/ +# fmctl-probe.cpp -- keep them in sync). +# +# We compile inside ${BUILDROOT} via chroot so the binary links against the +# rootfs's own libnvfm (from nvidia-fabricmanager-dev), guaranteeing ABI parity +# with the libnvfm.so loaded at runtime. Building from the outer Oracular +# builder would risk linking against a different libstdc++/libnvfm version. +# +# Removed from the image after compile: the source tree (~12 KiB) is build-time +# only; we don't want a build artifact source dir in production qcow2s. 
+# --------------------------------------------------------------------------- +FMCTL_SRC="${BUILDROOT}/usr/src/fmctl-probe/fmctl-probe.cpp" +FMCTL_HDR="${BUILDROOT}/usr/include/nv_fm_agent.h" +if [ -f "${FMCTL_SRC}" ] && [ -f "${FMCTL_HDR}" ]; then + echo "[postinst] compiling fmctl-probe inside rootfs against libnvfm" + chroot "${BUILDROOT}" /usr/bin/g++ \ + -std=c++17 -O2 -Wall -Wextra \ + /usr/src/fmctl-probe/fmctl-probe.cpp \ + -lnvfm \ + -o /usr/local/bin/fmctl-probe + chroot "${BUILDROOT}" /usr/bin/test -x /usr/local/bin/fmctl-probe + rm -rf "${BUILDROOT}/usr/src/fmctl-probe" + echo "[postinst] fmctl-probe baked at /usr/local/bin/fmctl-probe" +elif [ -f "${FMCTL_SRC}" ]; then + echo "[postinst] WARN: fmctl-probe source present but nv_fm_agent.h missing;" \ + "is nvidia-fabricmanager-dev in Packages=? Skipping bake." >&2 +fi diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.conf.d/ubuntu.conf b/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.conf.d/ubuntu.conf index ac0c9670cf..63966d123d 100644 --- a/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.conf.d/ubuntu.conf +++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.conf.d/ubuntu.conf @@ -12,6 +12,7 @@ Packages= linux-image-generic-hwe-24.04 linux-headers-generic-hwe-24.04 gcc + g++ make kmod udev @@ -25,11 +26,149 @@ Packages= iptables e2fsprogs cryptsetup - nvidia-driver-580-open=580.126.20-1ubuntu1 - nvidia-persistenced=580.126.20-1ubuntu1 - nvidia-fabricmanager=580.126.20-1 - libnvidia-nscq=580.126.20-1 + nvidia-driver-open=595.71.05-1ubuntu1 + nvidia-persistenced=595.71.05-1ubuntu1 + nvidia-fabricmanager=595.71.05-1ubuntu1 + libnvidia-nscq=595.71.05-1ubuntu1 nvidia-container-toolkit=1.19.0-1 + nvidia-fabricmanager-dev=595.71.05-1ubuntu1 + nvlsm + # nvlink5-595 metapackage components — installed individually because + # the metapackage is deprecated. 
Per the NVIDIA FM User Guide + # §"Installing Fabric Manager / Systems Using Fourth Generation + # NVSwitches", the canonical B200/B300 install path used to be: + # sudo apt-get install -V nvidia-open-<driver-branch> + # sudo apt-get install -V nvlink5-<driver-branch> + # The nvlink5 metapackage has no files of its own — it just pulls in + # a fixed list of components (see + # https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/nvlink5-migration.html). + # NVIDIA's documented migration is to install those components + # explicitly; the dep list of nvlink5_595.71.05-1 is: + # libnvidia-nscq (>=595) — pinned above + # libnvsdm (>=595) — added below + # nvidia-fabricmanager (>=595) — pinned above + # nvidia-imex (>=595) — added below + # nvidia-dkms-open / nvidia-kernel-open-dkms (>=595) — via nvidia-driver-open + # libnvidia-compute / nvidia-driver-cuda (>=595) — via nvidia-driver-open + # nvlsm (>=2025.10.12) — listed above (no version pin) + # collectx-bringup, mft, mft-oem, mft-autocomplete — added below (unversioned), see the CX7 tools note + # + # Why each one matters (FM Guide §Shared NVSwitch Virtualization Model + # and §1259 NVSwitch Errors On DGX B200/B300): + # libnvsdm — NVSwitch Device Manager telemetry library; replaces + # the SXID error path on B200/B300. DCGM (added + # below) reads NVSwitch port/ASIC counters through + # it; without it NVSwitch faults are invisible. + # nvidia-imex — Internode Memory Exchange daemon. Brokers secure + # cross-OS-instance shared CUDA memory channels over + # NVLink. Required for multi-tenant Shared-NVSwitch + # workloads where NCCL group formation registers + # memory regions that cross partition boundaries; + # without IMEX the GPU's secure subsystem fires + # Xid 170 SECURE Fatal CROSS_CONTAIN. See fortress + # scratch/oci-b200/.../docs/KNOWN-ISSUES.md §1 + # "mode C" for the symptom this targets. + libnvsdm=595.71.05-1ubuntu1 + nvidia-imex=595.71.05-1ubuntu1 + # CX7 / NVSwitch firmware-management and telemetry tools. 
These come + # from NVIDIA's CUDA repo, but their `ucx` dependency comes from + # NVIDIA's DOCA-Host networking repo. Both repos are wired into the + # mkosi.skeleton apt sources by Dockerfile.mkosi.ubuntu (see comments + # there for the priority-100 fallback pin that makes this resolve + # without dragging in MOFED userspace). + # collectx-bringup CX7 bringup utilities for the NVLink-management + # bridge. Depends on ucx (now installable via + # DOCA repo). Used by FM/NVLSM internal startup + # scripts to query CX7 SMDL/VPD info. + # mft / mft-oem / Mellanox Firmware Tools (mst, flint, mlxconfig, + # mft-autocomplete mlxlink). Required for B200 LPF firmware + # diagnostics when the CX7 bridge fails to come + # up. nvlsm's prelaunch script uses mst to enum + # management ports. + collectx-bringup + mft + mft-oem + mft-autocomplete + # Userspace OFED bits that the FM User Guide "NVIDIA Software Packages" + # section requires on B200/B300 ("OFED or MOFED package is required"). + # rdma-core pulls in libibverbs1, librdmacm1, libibmad5, libibumad3, + # libibnetdisc5, etc. ibverbs-utils gives ibv_devices/ibv_devinfo for + # triage. infiniband-diags below already provides ibstatus/ibstat. + rdma-core + ibverbs-utils + infiniband-diags + # Pin libibumad3 explicitly. FM User Guide §"Other NVIDIA Software + # Packages" calls out libibumad3 by name as a B200/B300 SVM + # requirement. Today this lands transitively via infiniband-diags, + # but pinning it makes the package set hermetic against any future + # transitive-dep churn and lets verify-svc-vm.sh's + # /lib/x86_64-linux-gnu/libibumad.so.3 gate be deterministic. + libibumad3 + # Pin nvidia-modprobe explicitly. SUID helper that auto-creates + # /dev/nvidia* device nodes for non-root NVML callers. Required for + # nvidia-imex, DCGM, and any tenant-side enumeration that doesn't + # run as root. Today this lands transitively via nvidia-driver-open= + # 595.71.05-1ubuntu1 but pinning makes it deterministic. 
+ # + # NOTE: nvidia-utils-595 was tried here on 2026-05-19 but does not + # exist in the cuda repo for Ubuntu 24.04 — the suffixed + # `nvidia-utils-` series stops at 580. Starting with R595 + # the open-driver branch packaging changed: nvidia-smi and the + # other userspace tools are bundled inside nvidia-driver-open= + # 595.71.05-1ubuntu1 itself (via Depends). Verify-svc-vm.sh's + # `nvidia-smi -q | grep Fabric` gate has been passing on prior + # 595-branch images for exactly this reason; no explicit pin needed. + nvidia-modprobe=595.71.05-1ubuntu1 + # Data Center GPU Manager (DCGM) v4. Per FM User Guide §"NVSwitch + # Errors On DGX B200/B300 and NVIDIA HGX B200/B300 Systems": + # "NVSwitch SXID errors are no longer applicable to DGX B200/B300 + # and NVIDIA HGX B200/B300 systems. DCGM now interfaces with a + # library called NVIDIA Switch Device Manager (NVSDM) to fetch + # errors related to NVSwitch." + # libnvsdm above gives us the library; DCGM is the consumer that + # surfaces those errors as queryable health/telemetry. Without DCGM, + # NVSwitch error visibility on B200 stops at FM/NVLSM syslog lines. + # The -cuda12 variant matches our R595 driver branch (R595 → CUDA 13 + # toolkit, but DCGM-cuda12 is forward-compatible per NVIDIA's matrix). + # + # NOTE: an earlier draft of this file also listed + # datacenter-gpu-manager-4-config, but that package does not exist + # in the NVIDIA cuda apt repo. The actual DCGM 4 package layout is: + # datacenter-gpu-manager-4-core shared base + # datacenter-gpu-manager-4-cuda{11,12,13} CUDA-version-specific + # binary; pulls -core + # transitively. Ships + # the systemd unit + # (nvidia-dcgm.service), + # dcgmi CLI, and default + # config files in + # /etc/nvidia-dcgm/. 
+ # datacenter-gpu-manager-4-cuda-all meta pulling all variants + # datacenter-gpu-manager-4-dev development headers + # datacenter-gpu-manager-4-multinode-* DCGM multinode shipping + # datacenter-gpu-manager-4-proprietary-* closed-source variant + # Installing -cuda12 alone gets the full daemon + tooling with no + # separate config package required. + datacenter-gpu-manager-4-cuda12 + # lshw provides the `lshw` hardware lister only; `vpddecode` ships in + # the separate `dmidecode` package and is NOT pulled in by lshw. Per FM + # User Guide §"Additional Steps for NVIDIA HGX B200/B300 Systems": + # "Query the VPD information using the lspci -vvs or vpddecode command + # and identify the four PF functions you want." Used to distinguish CX7 + # NVLink-management bridge LPFs from CX7 NICs by their SMDL=SW_MNG VPD + # field. Today we rely on `lspci -vvs` (pciutils, already installed); + # `lshw` is an extra inventory aid when BDF discovery needs hand-debugging. + lshw + docker.io + python3-minimal + python3-pip + curl + iputils-ping + pciutils + libcurl4t64 + libxml2 + libxmlsec1-openssl + pciutils RemoveFiles=/etc/issue RemoveFiles=/etc/issue.net diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.repart-debug/10-root.conf b/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.repart-debug/10-root.conf index 45c4011117..d01f42b616 100644 --- a/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.repart-debug/10-root.conf +++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.repart-debug/10-root.conf @@ -2,4 +2,12 @@ Type=root Format=ext4 CopyFiles=/ -Minimize=guess +# `Minimize=guess` chronically under-sizes the debug rootfs once the +# NVIDIA 595 stack (driver-open + fabricmanager + nscq + persistenced + +# nv-attestation-sdk) lands in /usr — the build either fails with +# `no space left on device` mid-mkosi or produces a qcow2 whose +# rootfs runs out of space at first boot. Pin to a fixed 12 GiB so +# the debug variant has headroom for the full B200 CC userspace. 
+Minimize=off +SizeMinBytes=12G +SizeMaxBytes=12G diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/etc/modules-load.d/nvlink-fabric.conf b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/etc/modules-load.d/nvlink-fabric.conf new file mode 100644 index 0000000000..7fbdd7fff7 --- /dev/null +++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/etc/modules-load.d/nvlink-fabric.conf @@ -0,0 +1,3 @@ +# Required by nv-fabricmanager NVL5+ subnet management path +# (nvidia-fabricmanager-start.sh checks lsmod for ib_umad and exits if missing) +ib_umad diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-imex.service.d/override.conf b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-imex.service.d/override.conf new file mode 100644 index 0000000000..8d59979721 --- /dev/null +++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-imex.service.d/override.conf @@ -0,0 +1,38 @@ +[Unit] + +[Service] +# Gate on actual NVIDIA GPU PCI presence. Mirrors the pattern already used +# by nvidia-persistenced.service.d / nvidia-fabricmanager.service.d / +# nvidia-cdi-refresh.service.d so the unit is harmless on non-GPU VMs. +# +# Why this is needed for the B200 Service VM specifically: +# The qemu-shared-nvswitch SVM is GPU-LESS by design (it owns only the +# four CX7 LPFs that drive the NVSwitch fabric, not the eight B200 GPUs +# which are passed straight through to tenant VMs). nvidia-imex however +# tries to open() /dev/nvidiactl at startup — which on a GPU-less VM +# does not exist (no nvidia.ko loaded, no /dev/nvidia* nodes). The +# daemon then fails with NV_ERR_OPERATING_SYSTEM ("Failed to allocate +# handle to NVIDIA GPU driver") and exits. +# +# That failure on its own would be tolerable, except the upstream +# nvidia-imex.service is Type=forking + TimeoutStartSec=infinity. 
The +# forking handshake gets stuck because the child exits BEFORE signaling +# the parent that it daemonized successfully, so the parent hangs in +# sigtimedwait() forever and `systemctl start nvidia-imex.service` +# never returns. (Observed first on 2026-05-20: svc-vm-bootstrap.sh +# §0c hung 5+ min waiting on the start; verbose log showed the +# NV_ERR_OPERATING_SYSTEM, /proc/PID/wchan = do_sigtimedwait.) +# +# ExecCondition=/usr/local/bin/check-nvidia-gpu skips the unit cleanly +# on any VM that doesn't have a 10de:* PCI device — i.e. the SVM — +# the same way nvidia-fabricmanager already does. Tenant VMs (which +# DO have GPUs) still start nvidia-imex normally. +ExecCondition=/usr/local/bin/check-nvidia-gpu + +# Belt-and-suspenders: cap any future forking-handshake hang at 2 min +# instead of inheriting the upstream `TimeoutStartSec=infinity`. Even +# on a real GPU node, an indefinite hang here would mask a real +# config error (e.g. malformed nodes_config.cfg) and stall the whole +# unit dependency graph. 120s is generous enough for the legitimate +# case (driver init + IMEX cluster bootstrap) without being unbounded. +TimeoutStartSec=120 diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-persistenced.service.d/override.conf b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-persistenced.service.d/override.conf index c81de87569..d30225d48f 100644 --- a/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-persistenced.service.d/override.conf +++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-persistenced.service.d/override.conf @@ -3,6 +3,13 @@ After=nvidia-fabricmanager.service [Service] ExecCondition=/usr/local/bin/check-nvidia-gpu +# Block daemon startup until every visible GPU has fabric.state=Completed. +# See /usr/local/bin/wait-nvlink-fabric.sh for the rationale (B200 CC race). 
+ExecStartPre=/usr/local/bin/wait-nvlink-fabric.sh +# wait-nvlink-fabric.sh polls for up to 180s by default, which is longer than +# systemd's default 90s start timeout; raise the limit so the script's own deadline and diagnostics fire first. +TimeoutStartSec=240 + ExecStart= ExecStart=/usr/bin/nvidia-persistenced --user nvidia-persistenced --uvm-persistence-mode --verbose ExecStartPost=/usr/bin/nvidia-smi conf-compute -srs 1 diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/local/bin/wait-nvlink-fabric.sh b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/local/bin/wait-nvlink-fabric.sh new file mode 100755 index 0000000000..28eafab87b --- /dev/null +++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/local/bin/wait-nvlink-fabric.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# wait-nvlink-fabric.sh — block until NVML reports the NVLink fabric +# registration is COMPLETE on every visible GPU. +# +# Why this exists (HGX B200 + CC/NVLE): +# On B200 in confidential-compute mode, the Service VM Fabric Manager +# programs NVSwitch routing for a partition asynchronously. The handshake +# with the guest GPU happens over in-band NVLink MAD (Probe Request -> +# Probe Response) AFTER fmActivateFabricPartition() returns success. +# nvidia-persistenced therefore races the handshake on guest boot. If it +# registers a GPU before that GPU finishes its handshake, NVML returns +# 0x81 (NVLINK_FABRIC_NOT_READY) and the daemon silently falls back to +# non-UVM persistence — leaving that GPU permanently unable to do NVLink +# P2P. Per the NVIDIA Secure AI Operations Guide ("Ensure that +# Persistence Mode is On"), the only way to recover from a missed +# SPDM/UVM session in CC mode is an FLR, i.e. a full VM restart. So we +# gate persistenced startup on fabric readiness rather than rely on the +# daemon's internal retry path (it does not retry on 0x81). 
+# +# This script is wired in via the nvidia-persistenced service drop-in +# /usr/lib/systemd/system/nvidia-persistenced.service.d/override.conf +# as ExecStartPre. It exits non-zero on timeout so the daemon fails loud +# rather than silently degrading. 
+ +set -euo pipefail + +TIMEOUT_SEC="${WAIT_FABRIC_TIMEOUT:-180}" +POLL_SEC="${WAIT_FABRIC_POLL:-2}" + +log() { printf "[wait-nvlink-fabric] %s\n" "$*" >&2; } + +deadline=$(( $(date +%s) + TIMEOUT_SEC )) +attempt=0 + +while :; do + attempt=$((attempt+1)) + states=$(nvidia-smi --query-gpu=index,uuid,fabric.state,fabric.status \ + --format=csv,noheader 2>/dev/null || true) + + if [[ -z "$states" ]]; then + log "attempt $attempt: nvidia-smi returned empty (driver not ready yet)" + else + not_ready=$(echo "$states" | awk -F"," " + BEGIN { n = 0 } + { + gsub(/^ +| +\$/, \"\", \$3) + gsub(/^ +| +\$/, \"\", \$4) + if (\$3 != \"Completed\" || \$4 != \"Success\") n++ + } + END { print n+0 }") + + if (( not_ready == 0 )); then + log "all GPUs fabric ready (attempt $attempt)" + echo "$states" | sed "s/^/[wait-nvlink-fabric] /" >&2 + exit 0 + fi + log "attempt $attempt: $not_ready GPU(s) fabric not yet Completed/Success" + fi + + if (( $(date +%s) >= deadline )); then + log "FAIL: NVLink fabric did not become ready within ${TIMEOUT_SEC}s" + log "current per-GPU state:" + echo "${states:-}" | sed "s/^/[wait-nvlink-fabric] /" >&2 + exit 1 + fi + + sleep "$POLL_SEC" +done diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/src/fmctl-probe/fmctl-probe.cpp b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/src/fmctl-probe/fmctl-probe.cpp new file mode 100644 index 0000000000..7296770166 --- /dev/null +++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/src/fmctl-probe/fmctl-probe.cpp @@ -0,0 +1,407 @@ +// fmctl-probe: minimal client for the NVIDIA Fabric Manager SDK. +// +// Purpose +// ------- +// Phase-0 spike for the Service-VM model. Validates that an FM running in +// FABRIC_MODE=1 inside the Service VM: +// 1. accepts FM-SDK calls on its TCP port (default 127.0.0.1:6666), +// 2. enumerates the supported "shared NVSwitch" partition catalogue, +// 3. activates / deactivates a chosen partition. 
+// +// Build / install +// --------------- +// The CANONICAL build path is image-time, NOT runtime. This source is +// vendored into cloud-api-adaptor at: +// src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/src/fmctl-probe/fmctl-probe.cpp +// and compiled by mkosi.postinst inside the rootfs (chroot ${BUILDROOT}) +// against the image's own libnvfm from nvidia-fabricmanager-dev. The +// resulting /usr/local/bin/fmctl-probe ships baked into the podvm qcow2. +// The two copies (this one and the CAA one) MUST stay byte-identical; +// run-podvm-build / 04-build-podvm-locally.sh do not auto-sync them. +// verify-svc-vm.sh enforces presence of /usr/local/bin/fmctl-probe and the +// `resolve` subcommand, so an out-of-date or stripped image fails closed. +// +// Manual (host- or SVM-side rebuild for ad-hoc debugging only): +// g++ -std=c++17 -O2 fmctl-probe.cpp -lnvfm -o fmctl-probe +// +// Usage +// ----- +// fmctl-probe list # dump partition catalogue +// fmctl-probe activate # activate partition +// fmctl-probe deactivate # deactivate partition +// fmctl-probe resolve # match by FM-reported pciBusId +// fmctl-probe resolve-by-physids # match by FM-reported physicalId +// +// `resolve` works when FM has GPU BDF info populated (i.e. in single-host +// non-FABRIC_MODE setups where FM and the GPUs share an OS instance and the +// NVIDIA driver is loaded locally). In our qemu-shared-nvswitch SVM topology +// FM runs in FABRIC_MODE=1 with NO GPUs in its OS (the GPUs live in tenant +// VMs), so fmGetSupportedFabricPartitions().gpuInfo[].pciBusId is empty for +// every entry and `resolve` ALWAYS returns "no match". Use +// `resolve-by-physids` instead in that case: pass a comma-separated list of +// physicalId integers (the host computes them from the GPU PCI BDFs sorted +// by bus number, the canonical B200 HGX baseboard mapping) and we match +// against fmGetSupportedFabricPartitions().gpuInfo[].physicalId, which IS +// populated by FM regardless of who owns the GPUs. 
+// +// Both resolve flavors exit 0 with the id on stdout, 2 on no match, +// 3 on ambiguous. +// +// Override target with FM_ADDR env var, e.g. FM_ADDR=127.0.0.1:6666. + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const char* fmReturnStr(fmReturn_t r) { + switch (r) { + case FM_ST_SUCCESS: return "SUCCESS"; + case FM_ST_BADPARAM: return "BADPARAM"; + case FM_ST_GENERIC_ERROR: return "GENERIC_ERROR"; + case FM_ST_NOT_SUPPORTED: return "NOT_SUPPORTED"; + case FM_ST_UNINITIALIZED: return "UNINITIALIZED"; + case FM_ST_TIMEOUT: return "TIMEOUT"; + case FM_ST_VERSION_MISMATCH: return "VERSION_MISMATCH"; + case FM_ST_IN_USE: return "IN_USE"; + case FM_ST_NOT_CONFIGURED: return "NOT_CONFIGURED"; + case FM_ST_CONNECTION_NOT_VALID: return "CONNECTION_NOT_VALID"; + case FM_ST_NVLINK_ERROR: return "NVLINK_ERROR"; + case FM_ST_RESOURCE_BAD: return "RESOURCE_BAD"; + case FM_ST_RESOURCE_IN_USE: return "RESOURCE_IN_USE"; + case FM_ST_RESOURCE_NOT_IN_USE: return "RESOURCE_NOT_IN_USE"; + case FM_ST_RESOURCE_EXHAUSTED: return "RESOURCE_EXHAUSTED"; + case FM_ST_RESOURCE_NOT_READY: return "RESOURCE_NOT_READY"; + case FM_ST_PARTITION_EXISTS: return "PARTITION_EXISTS"; + case FM_ST_PARTITION_ID_IN_USE: return "PARTITION_ID_IN_USE"; + case FM_ST_PARTITION_ID_NOT_IN_USE: return "PARTITION_ID_NOT_IN_USE"; + case FM_ST_NOT_READY: return "NOT_READY"; + default: return "UNKNOWN"; + } +} + +static fmReturn_t connectFm(fmHandle_t* handle) { + const char* addr = std::getenv("FM_ADDR"); + if (!addr || !*addr) addr = "127.0.0.1:6666"; + + fmConnectParams_t params{}; + params.version = fmConnectParams_version; + std::snprintf(params.addressInfo, sizeof(params.addressInfo), "%s", addr); + params.timeoutMs = 5000; + params.addressIsUnixSocket = 0; + params.addressType = NV_FM_API_ADDR_TYPE_INET; + + fmReturn_t r = fmConnect(¶ms, handle); + if (r != FM_ST_SUCCESS) { + std::fprintf(stderr, "fmConnect(%s) failed: %s (%d)\n", addr, 
fmReturnStr(r), r); + } else { + // To stderr, not stdout: stdout is reserved for machine-readable + // output of subcommands (e.g. resolve-by-physids prints just the + // partition id on stdout, so any banner here would be parsed as + // part of the id by the calling shell `pid=$(fmctl-probe ...)`). + std::fprintf(stderr, "[fmctl] connected to %s\n", addr); + } + return r; +} + +static int doList(fmHandle_t h) { + fmFabricPartitionList_t list{}; + list.version = fmFabricPartitionList_version; + fmReturn_t r = fmGetSupportedFabricPartitions(h, &list); + if (r != FM_ST_SUCCESS) { + std::fprintf(stderr, "fmGetSupportedFabricPartitions failed: %s (%d)\n", fmReturnStr(r), r); + return 1; + } + + std::printf("supported partitions: %u (max %u on this platform)\n", + list.numPartitions, list.maxNumPartitions); + for (unsigned i = 0; i < list.numPartitions; ++i) { + const auto& p = list.partitionInfo[i]; + std::printf(" partition id=%u active=%u numGpus=%u\n", + p.partitionId, p.isActive, p.numGpus); + for (unsigned g = 0; g < p.numGpus; ++g) { + const auto& gpu = p.gpuInfo[g]; + std::printf(" gpu physicalId=%u uuid=%s pci=%s nvlinks=%u/%u\n", + gpu.physicalId, gpu.uuid, gpu.pciBusId, + gpu.numNvLinksAvailable, gpu.maxNumNvLinks); + } + } + return 0; +} + +static int doActivate(fmHandle_t h, fmFabricPartitionId_t id) { + fmReturn_t r = fmActivateFabricPartition(h, id); + std::printf("fmActivateFabricPartition(%u) -> %s (%d)\n", id, fmReturnStr(r), r); + return r == FM_ST_SUCCESS ? 0 : 1; +} + +static int doDeactivate(fmHandle_t h, fmFabricPartitionId_t id) { + fmReturn_t r = fmDeactivateFabricPartition(h, id); + std::printf("fmDeactivateFabricPartition(%u) -> %s (%d)\n", id, fmReturnStr(r), r); + return r == FM_ST_SUCCESS ? 0 : 1; +} + +// Normalize a PCI BDF string to canonical form: "DDDDDDDD:BB:DD.F" (lowercase). 
// Accepts the four shapes that show up in practice:
//   "c0:00.0"           -- bus:dev.fn (domain implicit zero); operator typed
//   "0000:c0:00.0"      -- 4-hex-digit domain; some Linux tools
//   "00000000:c0:00.0"  -- 8-hex-digit domain; what NVML/FM-SDK returns
//   "00000000:C0:00.0"  -- same with uppercase bus
// Returns empty string if the input is malformed. Used by doResolve() so that
// operator-typed BDFs (`--gpus c0:00.0,d8:00.0`) compare equal to FM-reported
// BDFs (`fmFabricPartitionGpuInfo_t::pciBusId == "00000000:C0:00.0"`) regardless
// of casing or domain prefix.
//
// "Malformed" covers any character other than hex digits, ':' and '.', as
// well as fields outside the PCI ranges (bus <= 0xff, device <= 0x1f,
// function <= 7).
static std::string normalizeBdf(const std::string& in) {
    // sscanf's %x would otherwise accept a sign, a "0x" prefix and leading
    // blanks, none of which belong in a BDF ("-1:00.0" would parse as bus
    // 0xffffffff).
    for (unsigned char c : in) {
        if (!std::isxdigit(c) && c != ':' && c != '.') return "";
    }

    unsigned dom = 0, bus = 0, dev = 0, fn = 0;
    int n = 0;
    // %n records how many characters were consumed; requiring it to equal
    // the input length rejects trailing garbage such as "c0:00.0:1".
    if (std::sscanf(in.c_str(), "%x:%x:%x.%x%n", &dom, &bus, &dev, &fn, &n) == 4
        && n == static_cast<int>(in.size())) {
        // domain:bus:dev.fn given (any width domain)
    } else if (std::sscanf(in.c_str(), "%x:%x.%x%n", &bus, &dev, &fn, &n) == 3
               && n == static_cast<int>(in.size())) {
        dom = 0;  // domain omitted; treat as zero per Linux PCI convention
    } else {
        return "";
    }

    // Out-of-range fields cannot name a real PCI function; rejecting them
    // keeps the canonical form fixed-width.
    if (bus > 0xffu || dev > 0x1fu || fn > 0x7u) return "";

    char buf[32];
    std::snprintf(buf, sizeof(buf), "%08x:%02x:%02x.%x", dom, bus, dev, fn);
    return buf;
}

// Resolve a comma-separated BDF list to the unique partition id whose GPU
// PCI BDF set is exactly the input set. This implements the runtime side of
// the "operator declares GPUs, FM picks the partition" contract documented
// in docs/PARTITION-MAPPING.md.
+// +// Exit codes: +// 0 unique match; partition id printed on stdout +// 1 fmGetSupportedFabricPartitions() failed (unrecoverable) +// 2 no supported partition matches the requested BDF set; or input parse fail +// 3 more than one partition matches (this should never happen on a Blackwell +// baseboard since each (numGpus, gpu-set) combination is unique, but we +// fail loud rather than silently activate the first hit) +static int doResolve(fmHandle_t h, const std::string& bdfsCsv) { + fmFabricPartitionList_t list{}; + list.version = fmFabricPartitionList_version; + fmReturn_t r = fmGetSupportedFabricPartitions(h, &list); + if (r != FM_ST_SUCCESS) { + std::fprintf(stderr, + "fmGetSupportedFabricPartitions failed: %s (%d)\n", + fmReturnStr(r), r); + return 1; + } + + // Parse + normalize the requested BDF list into a sorted set. + std::set wanted; + { + std::stringstream ss(bdfsCsv); + std::string tok; + while (std::getline(ss, tok, ',')) { + tok.erase(std::remove_if(tok.begin(), tok.end(), + [](unsigned char c){ return std::isspace(c); }), + tok.end()); + if (tok.empty()) continue; + std::string normalized = normalizeBdf(tok); + if (normalized.empty()) { + std::fprintf(stderr, + "resolve: cannot parse BDF '%s' " + "(expected DDDD:BB:DD.F or BB:DD.F)\n", + tok.c_str()); + return 2; + } + wanted.insert(normalized); + } + } + if (wanted.empty()) { + std::fprintf(stderr, "resolve: empty BDF list\n"); + return 2; + } + + // Walk every supported partition, comparing its GPU BDF set against `wanted`. 
+ std::vector matches; + for (unsigned i = 0; i < list.numPartitions; ++i) { + const auto& p = list.partitionInfo[i]; + if (p.numGpus != wanted.size()) continue; // fast reject on size + std::set have; + for (unsigned g = 0; g < p.numGpus; ++g) { + std::string normalized = normalizeBdf(p.gpuInfo[g].pciBusId); + if (normalized.empty()) continue; + have.insert(normalized); + } + if (have == wanted) { + matches.push_back(p.partitionId); + } + } + + if (matches.empty()) { + std::fprintf(stderr, + "resolve: no supported partition matches BDF set {"); + bool first = true; + for (const auto& b : wanted) { + std::fprintf(stderr, "%s%s", first ? "" : ",", b.c_str()); + first = false; + } + std::fprintf(stderr, + "} -- check `fmctl-probe list` for the supported partition catalogue.\n"); + return 2; + } + if (matches.size() > 1) { + std::fprintf(stderr, + "resolve: AMBIGUOUS -- %zu partitions match BDF set:", + matches.size()); + for (unsigned id : matches) std::fprintf(stderr, " %u", id); + std::fprintf(stderr, + "\n(this is unexpected on a Blackwell baseboard; report to NVIDIA.)\n"); + return 3; + } + + // stdout: just the partition id, machine-readable for shell wrappers. + std::printf("%u\n", matches[0]); + return 0; +} + +// Resolve a comma-separated physicalId list to the unique partition id whose +// GPU physicalId set is exactly the input. Used in FABRIC_MODE=1 / shared +// NVSwitch topologies where FM has no local GPUs and so pciBusId is empty +// in fmGetSupportedFabricPartitions(); physicalId is still populated. 
+// +// Exit codes mirror doResolve(): +// 0 unique match; partition id printed on stdout +// 1 fmGetSupportedFabricPartitions() failed (unrecoverable) +// 2 no supported partition matches the requested physicalId set; or parse fail +// 3 more than one partition matches (should not happen on Blackwell) +static int doResolveByPhysIds(fmHandle_t h, const std::string& idsCsv) { + fmFabricPartitionList_t list{}; + list.version = fmFabricPartitionList_version; + fmReturn_t r = fmGetSupportedFabricPartitions(h, &list); + if (r != FM_ST_SUCCESS) { + std::fprintf(stderr, + "fmGetSupportedFabricPartitions failed: %s (%d)\n", + fmReturnStr(r), r); + return 1; + } + + // Parse the requested physicalId list into a sorted set. + std::set wanted; + { + std::stringstream ss(idsCsv); + std::string tok; + while (std::getline(ss, tok, ',')) { + tok.erase(std::remove_if(tok.begin(), tok.end(), + [](unsigned char c){ return std::isspace(c); }), + tok.end()); + if (tok.empty()) continue; + char* end = nullptr; + unsigned long v = std::strtoul(tok.c_str(), &end, 10); + if (!end || *end != '\0') { + std::fprintf(stderr, + "resolve-by-physids: cannot parse physicalId '%s' " + "(expected integer)\n", + tok.c_str()); + return 2; + } + wanted.insert(static_cast(v)); + } + } + if (wanted.empty()) { + std::fprintf(stderr, "resolve-by-physids: empty id list\n"); + return 2; + } + + std::vector matches; + for (unsigned i = 0; i < list.numPartitions; ++i) { + const auto& p = list.partitionInfo[i]; + if (p.numGpus != wanted.size()) continue; + std::set have; + for (unsigned g = 0; g < p.numGpus; ++g) { + have.insert(p.gpuInfo[g].physicalId); + } + if (have == wanted) { + matches.push_back(p.partitionId); + } + } + + if (matches.empty()) { + std::fprintf(stderr, + "resolve-by-physids: no supported partition matches physicalId set {"); + bool first = true; + for (unsigned id : wanted) { + std::fprintf(stderr, "%s%u", first ? 
"" : ",", id); + first = false; + } + std::fprintf(stderr, + "} -- check `fmctl-probe list` for the supported partition catalogue.\n"); + return 2; + } + if (matches.size() > 1) { + std::fprintf(stderr, + "resolve-by-physids: AMBIGUOUS -- %zu partitions match physicalId set:", + matches.size()); + for (unsigned id : matches) std::fprintf(stderr, " %u", id); + std::fprintf(stderr, + "\n(this is unexpected on a Blackwell baseboard; report to NVIDIA.)\n"); + return 3; + } + + std::printf("%u\n", matches[0]); + return 0; +} + +static void usage(const char* argv0) { + std::fprintf(stderr, + "Usage: %s [args]\n" + "Commands:\n" + " list enumerate supported partitions\n" + " activate activate partition \n" + " deactivate deactivate partition \n" + " resolve match by FM-reported pciBusId\n" + " (works only when FM has local GPUs)\n" + " resolve-by-physids match by FM-reported physicalId\n" + " (use in FABRIC_MODE=1 / shared NVSwitch\n" + " -- pciBusId is empty in that mode)\n" + "Both resolve flavors: exit 0 + id on stdout, 2 if no match, 3 if ambiguous.\n" + "Environment:\n" + " FM_ADDR FM SDK address (default 127.0.0.1:6666)\n", + argv0); +} + +int main(int argc, char** argv) { + if (argc < 2) { usage(argv[0]); return 2; } + std::string cmd = argv[1]; + + fmReturn_t init = fmLibInit(); + if (init != FM_ST_SUCCESS) { + std::fprintf(stderr, "fmLibInit failed: %s (%d)\n", fmReturnStr(init), init); + return 1; + } + + fmHandle_t h = nullptr; + if (connectFm(&h) != FM_ST_SUCCESS) { fmLibShutdown(); return 1; } + + int rc = 0; + if (cmd == "list") { + rc = doList(h); + } else if (cmd == "activate" && argc >= 3) { + rc = doActivate(h, static_cast(std::atoi(argv[2]))); + } else if (cmd == "deactivate" && argc >= 3) { + rc = doDeactivate(h, static_cast(std::atoi(argv[2]))); + } else if (cmd == "resolve" && argc >= 3) { + rc = doResolve(h, std::string(argv[2])); + } else if (cmd == "resolve-by-physids" && argc >= 3) { + rc = doResolveByPhysIds(h, std::string(argv[2])); + } else { 
+ usage(argv[0]); + rc = 2; + } + + fmDisconnect(h); + fmLibShutdown(); + return rc; +} diff --git a/src/cloud-api-adaptor/podvm/Dockerfile.podvm_binaries.ubuntu b/src/cloud-api-adaptor/podvm/Dockerfile.podvm_binaries.ubuntu index ca7cb06dac..ec62bb4199 100644 --- a/src/cloud-api-adaptor/podvm/Dockerfile.podvm_binaries.ubuntu +++ b/src/cloud-api-adaptor/podvm/Dockerfile.podvm_binaries.ubuntu @@ -17,6 +17,8 @@ ARG CUSTOM_GC_BINARIES="" ARG AA_FEATURES="" ARG GUEST_COMPONENTS_REF="" ARG GUEST_COMPONENTS_REPO="https://github.com/confidential-containers/guest-components.git" +ARG NVAT_REPO="https://github.com/NVIDIA/attestation-sdk.git" +ARG NVAT_TAG="2026.03.02" ARG DEBIAN_FRONTEND=noninteractive RUN set -e; \ if [ -n "${CUSTOM_GC_BINARIES}" ] && [ -z "${GUEST_COMPONENTS_REF}" ]; then \ @@ -26,7 +28,8 @@ RUN set -e; \ if [ -n "${CUSTOM_GC_BINARIES}" ] && [ -n "${GUEST_COMPONENTS_REF}" ]; then \ apt-get update && \ apt-get install -y --no-install-recommends \ - protobuf-compiler pkg-config clang libssl-dev libtss2-dev && \ + protobuf-compiler pkg-config clang libclang-dev libssl-dev libtss2-dev \ + cmake libcurl4-openssl-dev libxml2-dev libxmlsec1-dev libxmlsec1-openssl && \ apt-get clean && rm -rf /var/lib/apt/lists/* && \ mkdir -p /build/gc && cd /build/gc && \ git init && \ @@ -34,7 +37,25 @@ RUN set -e; \ git fetch --depth=1 origin "${GUEST_COMPONENTS_REF}" && \ git reset --hard FETCH_HEAD; \ fi +# Build NVIDIA Attestation SDK (libnvat) from source so the nvidia-attester +# feature can link against it. The AA binary will dynamically link libnvat.so, +# which must also be present in the final PodVM image at runtime. +# Installs its own build deps so this works even without CUSTOM_GC_BINARIES. 
+RUN set -e; \ + if echo "${AA_FEATURES}" | grep -q "nvidia-attester"; then \ + apt-get update && \ + apt-get install -y --no-install-recommends cmake libssl-dev libcurl4-openssl-dev \ + libxml2-dev libxmlsec1-dev libxmlsec1-openssl && \ + apt-get clean && rm -rf /var/lib/apt/lists/* && \ + git clone --depth 1 --branch "${NVAT_TAG}" "${NVAT_REPO}" /build/nvat && \ + cd /build/nvat/nv-attestation-sdk-cpp && \ + cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_INSTALL_LIBDIR=lib && \ + cmake --build build && \ + cmake --install build && \ + ldconfig; \ + fi COPY cloud-api-adaptor/podvm/build-guest-components.sh /build/ +ENV NVAT_USE_SYSTEM_LIB=1 RUN /build/build-guest-components.sh "${CUSTOM_GC_BINARIES}" "${AA_FEATURES}" # ubuntu:24.04 @@ -146,5 +167,14 @@ RUN for bin in /tmp/gc-overrides/*; do \ install -m0755 "$bin" /src/cloud-api-adaptor/podvm/files/usr/local/bin/"$(basename "$bin")"; \ done; true +# Copy libnvat shared library if it was built (needed at runtime by attestation-agent +# when compiled with nvidia-attester feature). Uses a mount instead of COPY so +# builds without nvidia-attester don't fail on an empty glob. +RUN --mount=from=gc_builder,src=/usr/lib/,dst=/tmp/gc-lib/,readonly \ + if ls /tmp/gc-lib/libnvat* 1>/dev/null 2>&1; then \ + mkdir -p /src/cloud-api-adaptor/podvm/files/usr/lib/ && \ + cp /tmp/gc-lib/libnvat* /src/cloud-api-adaptor/podvm/files/usr/lib/; \ + fi; true + FROM scratch COPY --from=podvm_binaries_builder /src/cloud-api-adaptor/podvm/files / diff --git a/src/cloud-api-adaptor/podvm/build-guest-components.sh b/src/cloud-api-adaptor/podvm/build-guest-components.sh index 2f586d4352..3d9de4f419 100755 --- a/src/cloud-api-adaptor/podvm/build-guest-components.sh +++ b/src/cloud-api-adaptor/podvm/build-guest-components.sh @@ -30,6 +30,10 @@ for bin in "${BINS[@]}"; do exit 1 fi cd /build/gc/attestation-agent/attestation-agent + # Refresh lockfile so optional feature deps (e.g. 
nv-attestation-sdk + # for nvidia-attester) are resolved even if the checked-in Cargo.lock + # was generated without them. + cargo update --workspace cargo build --release --locked --no-default-features \ --features "$AA_FEATURES" --bin ttrpc-aa cp /build/gc/target/release/ttrpc-aa "$OUTDIR/attestation-agent"