diff --git a/.github/workflows/build-podvm-cohere.yaml b/.github/workflows/build-podvm-cohere.yaml
index dfabc84e2e..6c5f23b7fc 100644
--- a/.github/workflows/build-podvm-cohere.yaml
+++ b/.github/workflows/build-podvm-cohere.yaml
@@ -22,10 +22,20 @@ on:
         type: string
         default: "https://github.com/cohere-ai/guest-components.git"
       guest_components_ref:
-        description: "guest-components ref (default: cohere)"
+        description: |
+          guest-components ref (branch, tag, or SHA).
+
+          Default: alhassankhedr/sync-main-to-cohere (head of PR #9).
+          That branch carries upstream main's nvidia-attester rewrite
+          (NVAT SDK based, no `count == 1` guard) and is required for
+          multi-GPU evidence to work end-to-end on 8x B200 hosts. The
+          plain `cohere` branch still has the old NVML-based attester
+          which silently produces empty evidence on 2+ GPU systems
+          (unless the `s/count == 1/count >= 1/` sed patch from the podvm-mkosi
+          Dockerfile is applied). Switch back to `cohere` after PR #9 merges.
         required: false
         type: string
-        default: "cohere"
+        default: "alhassankhedr/sync-main-to-cohere"
       custom_gc_binaries:
         description: "guest-components binaries to build from source"
         required: false
@@ -46,6 +56,11 @@ on:
         required: false
         type: boolean
         default: false
+      b200_cc_drivers:
+        description: "Install NVIDIA 595.71.05 open driver (enables Confidential Computing on multi-GPU B200). EXPERIMENTAL"
+        required: false
+        type: boolean
+        default: false
 
 permissions:
   id-token: write      # OIDC token for build provenance attestation
@@ -70,11 +85,13 @@ jobs:
       image_name_debug: ${{ steps.compute.outputs.image_name_debug }}
       image_tag_release: ${{ steps.compute.outputs.image_tag_release }}
       image_tag_debug: ${{ steps.compute.outputs.image_tag_debug }}
+      b200_cc_drivers: ${{ steps.compute.outputs.b200_cc_drivers }}
     steps:
       - name: Compute tags and image names
         id: compute
         env:
           DISTRO: ${{ inputs.distro || 'ubuntu' }}
+          B200_CC_DRIVERS: ${{ inputs.b200_cc_drivers && 'true' || 'false' }}
         run: |
           if [[ "$GITHUB_REF" == refs/tags/podvm-v* ]]; then
             TAG="${GITHUB_REF#refs/tags/podvm-}"
@@ -84,10 +101,15 @@ jobs:
             REPLACE_IMAGE="true"
           fi
           TAG="${TAG//./-}"
+          # Suffix CC-driver builds so they never collide with standard images
+          if [ "$B200_CC_DRIVERS" = "true" ]; then
+            TAG="${TAG}-cc595"
+          fi
           {
             echo "tag=$TAG"
             echo "distro=$DISTRO"
             echo "replace_image=$REPLACE_IMAGE"
+            echo "b200_cc_drivers=$B200_CC_DRIVERS"
             echo "image_name_release=podvm-${DISTRO}-${TEE_PLATFORM}-release-${TAG}"
             echo "image_name_debug=podvm-${DISTRO}-${TEE_PLATFORM}-debug-${TAG}"
             echo "image_tag_release=${TAG}-${DISTRO}-release"
@@ -176,7 +198,7 @@ jobs:
           PODVM_DISTRO: ${{ needs.meta.outputs.distro }}
           AA_FEATURES: ${{ inputs.aa_features || 'bin,ttrpc,kbs,coco_as,rust-crypto,tdx-attester,nvidia-attester' }}
           GC_REPO: ${{ inputs.guest_components_repo || 'https://github.com/cohere-ai/guest-components.git' }}
-          GC_REF: ${{ inputs.guest_components_ref || 'cohere' }}
+          GC_REF: ${{ inputs.guest_components_ref || 'alhassankhedr/sync-main-to-cohere' }}
           GC_CUSTOM_BINARIES: ${{ inputs.custom_gc_binaries || 'attestation-agent,api-server-rest' }}
         run: |
           MAKE_ARGS=(
@@ -195,6 +217,47 @@ jobs:
           echo "Disk after binaries build:"
           df -h /
 
+      - name: Override NVIDIA driver to 595.71.05 (B200 multi-GPU CC)
+        if: needs.meta.outputs.b200_cc_drivers == 'true'
+        working-directory: src/cloud-api-adaptor/podvm-mkosi
+        run: |
+          set -euo pipefail
+          CONF=mkosi.presets/system/mkosi.conf.d/ubuntu.conf
+          # Re-pin the driver stack to 595.71.05. The 595 branch only ships the
+          # unversioned `nvidia-driver-open` metapackage, so accept either that
+          # name or the legacy `nvidia-driver-580-open` pin, whatever its version.
+          sed -i -E \
+            -e 's|^([[:space:]]*)nvidia-driver(-580)?-open=.*|\1nvidia-driver-open=595.71.05-1ubuntu1|' \
+            -e 's|^([[:space:]]*)nvidia-persistenced=.*|\1nvidia-persistenced=595.71.05-1ubuntu1|' \
+            -e 's|^([[:space:]]*)nvidia-fabricmanager=.*|\1nvidia-fabricmanager=595.71.05-1ubuntu1|' \
+            -e 's|^([[:space:]]*)libnvidia-nscq=.*|\1libnvidia-nscq=595.71.05-1ubuntu1|' \
+            "$CONF"
+          echo "----- Updated NVIDIA package pins -----"
+          grep -E '^[[:space:]]*(nvidia|libnvidia)' "$CONF"
+
+      - name: Increase debug root partition for CC595 drivers
+        if: needs.meta.outputs.b200_cc_drivers == 'true' && matrix.profile == 'debug'
+        working-directory: src/cloud-api-adaptor/podvm-mkosi
+        run: |
+          set -euo pipefail
+          REPART_CONF=mkosi.presets/system/mkosi.repart-debug/10-root.conf
+          # Minimize=guess underestimates the root filesystem once the NVIDIA
+          # 595 drivers are installed, and mkfs.ext4 then aborts the build with
+          # "No space left on device"; write a fixed 12G root partition instead.
+          printf '%s\n' '[Partition]' 'Type=root' 'Format=ext4' 'CopyFiles=/' 'Minimize=off' 'SizeMinBytes=12G' 'SizeMaxBytes=12G' > "$REPART_CONF"
+          echo "----- Updated repart config -----"
+          cat "$REPART_CONF"
+
+      - name: Resolve installed NVIDIA driver version
+        working-directory: src/cloud-api-adaptor/podvm-mkosi
+        run: |
+          set -euo pipefail
+          CONF=mkosi.presets/system/mkosi.conf.d/ubuntu.conf
+          # grep -o keeps only a well-formed `<pkg>=X.Y.Z` pin; no match fails the pipeline (pipefail) instead of exporting the raw line.
+          DRIVER_VER=$(grep -oE '^[[:space:]]*nvidia-driver(-580)?-open=[0-9]+\.[0-9]+\.[0-9]+' "$CONF" | head -n1 | sed -E 's|.*=||')
+          echo "Resolved NVIDIA driver version: $DRIVER_VER"
+          echo "NVIDIA_DRIVER=$DRIVER_VER" >> "$GITHUB_ENV"
+
       - name: Build OS image
         working-directory: src/cloud-api-adaptor/podvm-mkosi
         env:
@@ -265,6 +328,7 @@ jobs:
             --arg distro "$DISTRO" \
             --arg profile "$PROFILE" \
             --arg tee_platform "$TEE_PLATFORM" \
+            --arg nvidia_driver "$NVIDIA_DRIVER" \
             --arg build_date "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
             '$ARGS.named' > /tmp/measurements.json
 
@@ -307,6 +371,7 @@ jobs:
             --annotation "com.cohere.caa.commit=${CAA_COMMIT}" \
             --annotation "com.cohere.caa.version=${GITHUB_REF_NAME}" \
             --annotation "com.cohere.rtmr2=${RTMR2}" \
+            --annotation "com.cohere.nvidia.driver=${NVIDIA_DRIVER}" \
             --format json > oras-output.json
 
           cat oras-output.json
diff --git a/src/cloud-api-adaptor/podvm-mkosi/Dockerfile.mkosi.ubuntu b/src/cloud-api-adaptor/podvm-mkosi/Dockerfile.mkosi.ubuntu
index ee25af5f79..22ab71a337 100644
--- a/src/cloud-api-adaptor/podvm-mkosi/Dockerfile.mkosi.ubuntu
+++ b/src/cloud-api-adaptor/podvm-mkosi/Dockerfile.mkosi.ubuntu
@@ -51,6 +51,25 @@ COPY mkosi.conf.ubuntu /image/mkosi.conf
 
 # Add NVIDIA APT repos to mkosi.skeleton/ so they are present in the image tree
 # *before* package installation. mkosi applies SkeletonTrees before apt-get runs.
+#
+# Three NVIDIA repos are wired in:
+#   1. CUDA repo (developer.download.nvidia.com/compute/cuda)
+#      Provides nvidia-driver-open, nvidia-fabricmanager, libnvidia-nscq,
+#      nvidia-imex, libnvsdm, nvlsm, collectx-bringup, mft, mft-oem,
+#      mft-autocomplete, etc. — the full R595 + nvlink5 stack EXCEPT for
+#      the `ucx` dependency that collectx-bringup pulls in.
+#   2. nvidia-container-toolkit repo (nvidia.github.io/libnvidia-container)
+#      Provides nvidia-container-toolkit and friends.
+#   3. NVIDIA DOCA-Host networking repo (linux.mellanox.com/public/repo/doca)
+#      Provides `ucx` (Unified Communication X) and the matching MOFED
+#      userspace stack. We need it ONLY for `ucx` (so collectx-bringup's
+#      Depends: ucx resolves) and as an alternate source for mft*/
+#      collectx-bringup. The repo is pinned at priority 100 so it acts as
+#      a fallback — packages already available from the cuda repo or from
+#      Ubuntu universe (e.g. rdma-core, ibverbs-utils, libibumad3) are
+#      NOT replaced by the MOFED variants, which would otherwise turn the
+#      image into a MOFED-userspace install. Only ucx (which exists ONLY
+#      in DOCA) and any explicitly-requested package fall through to it.
 RUN mkdir -p /image/mkosi.skeleton/etc/apt/sources.list.d \
              /image/mkosi.skeleton/etc/apt/preferences.d \
              /image/mkosi.skeleton/usr/share/keyrings \
@@ -58,10 +77,14 @@ RUN mkdir -p /image/mkosi.skeleton/etc/apt/sources.list.d \
        -o /image/mkosi.skeleton/usr/share/keyrings/cuda-archive-keyring.gpg \
     && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
        | gpg --dearmor -o /image/mkosi.skeleton/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+    && curl -fsSL https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox \
+       | gpg --dearmor -o /image/mkosi.skeleton/usr/share/keyrings/nvidia-doca-keyring.gpg \
     && echo 'deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/ /' \
        > /image/mkosi.skeleton/etc/apt/sources.list.d/cuda-ubuntu2404-x86_64.list \
     && echo 'deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://nvidia.github.io/libnvidia-container/stable/deb/amd64 /' \
        > /image/mkosi.skeleton/etc/apt/sources.list.d/nvidia-container-toolkit.list \
+    && echo 'deb [signed-by=/usr/share/keyrings/nvidia-doca-keyring.gpg] https://linux.mellanox.com/public/repo/doca/latest/ubuntu24.04/x86_64/ ./' \
+       > /image/mkosi.skeleton/etc/apt/sources.list.d/nvidia-doca-ubuntu2404.list \
     && printf '%s\n' \
        'Package: nvidia-*' \
        'Pin: origin developer.download.nvidia.com' \
@@ -74,7 +97,24 @@ RUN mkdir -p /image/mkosi.skeleton/etc/apt/sources.list.d \
        'Package: cuda-*' \
        'Pin: origin developer.download.nvidia.com' \
        'Pin-Priority: 1001' \
-       > /image/mkosi.skeleton/etc/apt/preferences.d/nvidia-cuda-repo
+       > /image/mkosi.skeleton/etc/apt/preferences.d/nvidia-cuda-repo \
+    && printf '%s\n' \
+       '# Pin the NVIDIA DOCA-Host repo to priority 100. Apt rule for' \
+       '# "Pin-Priority: 100": install ONLY if explicitly requested or as' \
+       '# a dep, never auto-upgrade or replace a package available from' \
+       '# a higher-priority source. Outcome:' \
+       '#   - ucx (only in DOCA)        -> installs from DOCA  ✓ (resolves' \
+       '#                                  collectx-bringup Depends: ucx)' \
+       '#   - rdma-core, ibverbs-utils, libibumad3 (in universe @500 AND' \
+       '#     in DOCA @100)             -> installs from universe ✓' \
+       '#                                  (keeps inbox OFED, not MOFED)' \
+       '#   - collectx-bringup, mft*    -> install from cuda repo (origin' \
+       '#                                  developer.download.nvidia.com,' \
+       '#                                  default priority 500 > 100)' \
+       'Package: *' \
+       'Pin: origin linux.mellanox.com' \
+       'Pin-Priority: 100' \
+       > /image/mkosi.skeleton/etc/apt/preferences.d/nvidia-doca-repo
 
 RUN --security=insecure mkosi --profile=$PROFILE --image-version=$IMAGE_VERSION
 
diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.postinst b/src/cloud-api-adaptor/podvm-mkosi/mkosi.postinst
index 6dc3beb46f..4c93012367 100755
--- a/src/cloud-api-adaptor/podvm-mkosi/mkosi.postinst
+++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.postinst
@@ -2,6 +2,56 @@
 
 set -euxo pipefail
 
+# ---------------------------------------------------------------------------
+# Distro guard: this image MUST be Ubuntu noble (only ID=ubuntu is checked below).
+#
+# The Makefile default is `PODVM_DISTRO ?= fedora`, so a bare
+# `make image-debug` silently builds a Fedora qcow2. The Cohere B200 SVM
+# stack assumes Ubuntu noble end-to-end:
+#   - mkosi.presets/system/mkosi.conf.d/ubuntu.conf pins 13 NVIDIA packages
+#     (nvidia-driver-open=595.71.05-1ubuntu1, nvidia-fabricmanager*,
+#     libnvsdm, nvidia-imex, libnvidia-nscq, nvlsm, collectx-bringup, mft*,
+#     rdma-core, ibverbs-utils, libibumad3, datacenter-gpu-manager-4-cuda12,
+#     nvidia-modprobe, nvidia-container-toolkit) from the CUDA + DOCA-Host
+#     apt repos. NONE of these resolve in Fedora's package universe.
+#   - service-vm/scripts/svc-vm-bootstrap.sh runtime install workaround
+#     uses `apt-get download` + `dpkg -x` (no rpm equivalent here).
+#   - service-vm/scripts/verify-svc-vm.sh hard-fail gates use dpkg-query,
+#     ldconfig with /lib/x86_64-linux-gnu/, and apt-style version strings.
+#   - The fmctl-probe bake below links against
+#     /usr/lib/x86_64-linux-gnu/libnvfm.so from nvidia-fabricmanager-dev,
+#     which does not exist in Fedora's NVIDIA repos.
+#
+# A Fedora qcow2 produced by this pipeline would silently lack every B200
+# component and the SVM would only fail at FM startup time -- AFTER the
+# operator has copied a 5 GiB image into place and rebooted. Fail here
+# instead, before mkosi finalizes the disk image.
+# ---------------------------------------------------------------------------
+os_release="${BUILDROOT}/etc/os-release"
+if [ -f "$os_release" ]; then  # no os-release in the rootfs -> the guard is skipped
+    distro_id=$(awk -F= '$1=="ID" {gsub(/"/,"",$2); print $2; exit}' "$os_release")
+    [ "$distro_id" = "ubuntu" ] || {  # any other ID= value aborts the build
+        cat >&2 <<EOF
+[postinst] FAIL: rootfs is ID=${distro_id}, expected ID=ubuntu.
+[postinst]
+[postinst] The Makefile default is \`PODVM_DISTRO ?= fedora\`, so a bare
+[postinst] \`make image-debug\` silently builds a Fedora qcow2 that lacks
+[postinst] the entire B200 NVIDIA stack pinned in
+[postinst] mkosi.presets/system/mkosi.conf.d/ubuntu.conf. Re-invoke with:
+[postinst]
+[postinst]     PODVM_DISTRO=ubuntu make image-debug
+[postinst]
+[postinst] Or use one of the wrapper scripts that sets it for you:
+[postinst]   - /tmp/run-podvm-build.sh                  (host-side captured)
+[postinst]   - host/scripts/04-build-podvm-locally.sh   (fortress, reproducible)
+[postinst]   - host/scripts/run-podvm-build.host.sh     (fortress, verbatim capture)
+[postinst]
+[postinst] (Hit on $(date -u +%Y-%m-%dT%H:%M:%SZ).)
+EOF
+        exit 1
+    }
+fi
+
 # move issue files away from /etc
 # to allow /run/issue and /run/issue.d to take precedence
 mv "${BUILDROOT}/etc/issue.d" "${BUILDROOT}/usr/lib/issue.d" || true
@@ -42,3 +92,39 @@ if [ -f /usr/lib/systemd/boot/efi/linuxx64.efi.stub ]; then
     cp /usr/lib/systemd/boot/efi/linuxx64.efi.stub \
        "${BUILDROOT}/usr/lib/systemd/boot/efi/linuxx64.efi.stub"
 fi
+
+# ---------------------------------------------------------------------------
+# Bake fmctl-probe (NVIDIA Fabric Manager SDK client) into the SVM image.
+#
+# fmctl-probe is the small C++ utility used by the host-side
+# {activate,deactivate}-partition-by-bdfs.sh wrappers to drive
+# fmActivateFabricPartition / fmDeactivateFabricPartition / and the
+# fmGetSupportedFabricPartitions-based BDF->partition-id resolver.
+# Source is vendored at mkosi.skeleton/usr/src/fmctl-probe/fmctl-probe.cpp
+# (canonical copy lives in fortress: scratch/oci-b200/.../orchestration/scripts/
+# fmctl-probe.cpp -- keep them in sync).
+#
+# We compile inside ${BUILDROOT} via chroot so the binary links against the
+# rootfs's own libnvfm (from nvidia-fabricmanager-dev), guaranteeing ABI parity
+# with the libnvfm.so loaded at runtime. Building from the outer Oracular
+# builder would risk linking against a different libstdc++/libnvfm version.
+#
+# Removed from the image after compile: the source tree (~12 KiB) is build-time
+# only; we don't want a build artifact source dir in production qcow2s.
+# ---------------------------------------------------------------------------
+FMCTL_SRC="${BUILDROOT}/usr/src/fmctl-probe/fmctl-probe.cpp"
+FMCTL_HDR="${BUILDROOT}/usr/include/nv_fm_agent.h"
+if [ -f "${FMCTL_SRC}" ] && [ -f "${FMCTL_HDR}" ]; then
+    echo "[postinst] compiling fmctl-probe inside rootfs against libnvfm"
+    chroot "${BUILDROOT}" /usr/bin/g++ \
+        -std=c++17 -O2 -Wall -Wextra \
+        /usr/src/fmctl-probe/fmctl-probe.cpp \
+        -lnvfm \
+        -o /usr/local/bin/fmctl-probe
+    chroot "${BUILDROOT}" /usr/bin/test -x /usr/local/bin/fmctl-probe
+    rm -rf "${BUILDROOT}/usr/src/fmctl-probe"  # source tree is build-time only
+    echo "[postinst] fmctl-probe baked at /usr/local/bin/fmctl-probe"
+elif [ -f "${FMCTL_SRC}" ]; then
+    echo "[postinst] WARN: fmctl-probe source present but nv_fm_agent.h missing; is nvidia-fabricmanager-dev in Packages=? Skipping bake." >&2
+    rm -rf "${BUILDROOT}/usr/src/fmctl-probe"  # never ship the vendored source, even when the bake is skipped
+fi
diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.conf.d/ubuntu.conf b/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.conf.d/ubuntu.conf
index ac0c9670cf..63966d123d 100644
--- a/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.conf.d/ubuntu.conf
+++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.conf.d/ubuntu.conf
@@ -12,6 +12,7 @@ Packages=
     linux-image-generic-hwe-24.04
     linux-headers-generic-hwe-24.04
     gcc
+    g++
     make
     kmod
     udev
@@ -25,11 +26,149 @@ Packages=
     iptables
     e2fsprogs
     cryptsetup
-    nvidia-driver-580-open=580.126.20-1ubuntu1
-    nvidia-persistenced=580.126.20-1ubuntu1
-    nvidia-fabricmanager=580.126.20-1
-    libnvidia-nscq=580.126.20-1
+    nvidia-driver-open=595.71.05-1ubuntu1
+    nvidia-persistenced=595.71.05-1ubuntu1
+    nvidia-fabricmanager=595.71.05-1ubuntu1
+    libnvidia-nscq=595.71.05-1ubuntu1
     nvidia-container-toolkit=1.19.0-1
+    nvidia-fabricmanager-dev=595.71.05-1ubuntu1
+    nvlsm
+    # nvlink5-595 metapackage components — installed individually because
+    # the metapackage is deprecated. Per the NVIDIA FM User Guide
+    # §"Installing Fabric Manager / Systems Using Fourth Generation
+    # NVSwitches", the canonical B200/B300 install path used to be:
+    #   sudo apt-get install -V nvidia-open-<branch>
+    #   sudo apt-get install -V nvlink5-<branch>
+    # The nvlink5 metapackage has no files of its own — it just pulls in
+    # a fixed list of components (see
+    # https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/nvlink5-migration.html).
+    # NVIDIA's documented migration is to install those components
+    # explicitly; the dep list of nvlink5_595.71.05-1 is:
+    #   libnvidia-nscq (>=595)                              — pinned above
+    #   libnvsdm (>=595)                                    — added below
+    #   nvidia-fabricmanager (>=595)                        — pinned above
+    #   nvidia-imex (>=595)                                 — added below
+    #   nvidia-dkms-open / nvidia-kernel-open-dkms (>=595)  — via nvidia-driver-open
+    #   libnvidia-compute / nvidia-driver-cuda (>=595)      — via nvidia-driver-open
+    #   nvlsm (>=2025.10.12)                                — listed above (no version pin)
+    #   collectx-bringup, mft, mft-oem, mft-autocomplete    — added below (ucx dep via DOCA repo)
+    #
+    # Why each one matters (FM Guide §Shared NVSwitch Virtualization Model
+    # and §1259 NVSwitch Errors On DGX B200/B300):
+    #   libnvsdm     — NVSwitch Device Manager telemetry library; replaces
+    #                  the SXID error path on B200/B300. DCGM (added
+    #                  below) reads NVSwitch port/ASIC counters through
+    #                  it; without it NVSwitch faults are invisible.
+    #   nvidia-imex  — Internode Memory Exchange daemon. Brokers secure
+    #                  cross-OS-instance shared CUDA memory channels over
+    #                  NVLink. Required for multi-tenant Shared-NVSwitch
+    #                  workloads where NCCL group formation registers
+    #                  memory regions that cross partition boundaries;
+    #                  without IMEX the GPU's secure subsystem fires
+    #                  Xid 170 SECURE Fatal CROSS_CONTAIN. See fortress
+    #                  scratch/oci-b200/.../docs/KNOWN-ISSUES.md §1
+    #                  "mode C" for the symptom this targets.
+    libnvsdm=595.71.05-1ubuntu1
+    nvidia-imex=595.71.05-1ubuntu1
+    # CX7 / NVSwitch firmware-management and telemetry tools. These come
+    # from NVIDIA's CUDA repo, but their `ucx` dependency comes from
+    # NVIDIA's DOCA-Host networking repo. Both repos are wired into the
+    # mkosi.skeleton apt sources by Dockerfile.mkosi.ubuntu (see comments
+    # there for the priority-100 fallback pin that makes this resolve
+    # without dragging in MOFED userspace).
+    #   collectx-bringup  CX7 bringup utilities for the NVLink-management
+    #                     bridge. Depends on ucx (now installable via
+    #                     DOCA repo). Used by FM/NVLSM internal startup
+    #                     scripts to query CX7 SMDL/VPD info.
+    #   mft / mft-oem /   Mellanox Firmware Tools (mst, flint, mlxconfig,
+    #   mft-autocomplete  mlxlink). Required for B200 LPF firmware
+    #                     diagnostics when the CX7 bridge fails to come
+    #                     up. nvlsm's prelaunch script uses mst to enum
+    #                     management ports.
+    collectx-bringup
+    mft
+    mft-oem
+    mft-autocomplete
+    # Userspace OFED bits that the FM User Guide "NVIDIA Software Packages"
+    # section requires on B200/B300 ("OFED or MOFED package is required").
+    # rdma-core pulls in libibverbs1, librdmacm1, libibmad5, libibumad3,
+    # libibnetdisc5, etc. ibverbs-utils gives ibv_devices/ibv_devinfo for
+    # triage. infiniband-diags below already provides ibstatus/ibstat.
+    rdma-core
+    ibverbs-utils
+    infiniband-diags
+    # Pin libibumad3 explicitly. FM User Guide §"Other NVIDIA Software
+    # Packages" calls out libibumad3 by name as a B200/B300 SVM
+    # requirement. Today this lands transitively via infiniband-diags,
+    # but pinning it makes the package set hermetic against any future
+    # transitive-dep churn and lets verify-svc-vm.sh's
+    # /lib/x86_64-linux-gnu/libibumad.so.3 gate be deterministic.
+    libibumad3
+    # Pin nvidia-modprobe explicitly. SUID helper that auto-creates
+    # /dev/nvidia* device nodes for non-root NVML callers. Required for
+    # nvidia-imex, DCGM, and any tenant-side enumeration that doesn't
+    # run as root. Today this lands transitively via nvidia-driver-open=
+    # 595.71.05-1ubuntu1 but pinning makes it deterministic.
+    #
+    # NOTE: nvidia-utils-595 was tried here on 2026-05-19 but does not
+    # exist in the cuda repo for Ubuntu 24.04 — the suffixed
+    # `nvidia-utils-<branch>` series stops at 580. Starting with R595
+    # the open-driver branch packaging changed: nvidia-smi and the
+    # other userspace tools are bundled inside nvidia-driver-open=
+    # 595.71.05-1ubuntu1 itself (via Depends). Verify-svc-vm.sh's
+    # `nvidia-smi -q | grep Fabric` gate has been passing on prior
+    # 595-branch images for exactly this reason; no explicit pin needed.
+    nvidia-modprobe=595.71.05-1ubuntu1
+    # Data Center GPU Manager (DCGM) v4. Per FM User Guide §"NVSwitch
+    # Errors On DGX B200/B300 and NVIDIA HGX B200/B300 Systems":
+    #   "NVSwitch SXID errors are no longer applicable to DGX B200/B300
+    #    and NVIDIA HGX B200/B300 systems. DCGM now interfaces with a
+    #    library called NVIDIA Switch Device Manager (NVSDM) to fetch
+    #    errors related to NVSwitch."
+    # libnvsdm above gives us the library; DCGM is the consumer that
+    # surfaces those errors as queryable health/telemetry. Without DCGM,
+    # NVSwitch error visibility on B200 stops at FM/NVLSM syslog lines.
+    # The -cuda12 variant matches our R595 driver branch (R595 → CUDA 13
+    # toolkit, but DCGM-cuda12 is forward-compatible per NVIDIA's matrix).
+    #
+    # NOTE: an earlier draft of this file also listed
+    # datacenter-gpu-manager-4-config, but that package does not exist
+    # in the NVIDIA cuda apt repo. The actual DCGM 4 package layout is:
+    #   datacenter-gpu-manager-4-core           shared base
+    #   datacenter-gpu-manager-4-cuda{11,12,13} CUDA-version-specific
+    #                                           binary; pulls -core
+    #                                           transitively. Ships
+    #                                           the systemd unit
+    #                                           (nvidia-dcgm.service),
+    #                                           dcgmi CLI, and default
+    #                                           config files in
+    #                                           /etc/nvidia-dcgm/.
+    #   datacenter-gpu-manager-4-cuda-all       meta pulling all variants
+    #   datacenter-gpu-manager-4-dev            development headers
+    #   datacenter-gpu-manager-4-multinode-*    DCGM multinode shipping
+    #   datacenter-gpu-manager-4-proprietary-*  closed-source variant
+    # Installing -cuda12 alone gets the full daemon + tooling with no
+    # separate config package required.
+    datacenter-gpu-manager-4-cuda12
+    # lshw provides `lshw`. Per FM User Guide §"Additional Steps for NVIDIA
+    # HGX B200/B300 Systems": "Query the VPD information using the lspci
+    # -vvs or vpddecode command and identify the four PF functions you
+    # want." Used to distinguish CX7 NVLink-management bridge LPFs from
+    # CX7 NICs by their SMDL=SW_MNG VPD field. Today we rely on
+    # `lspci -vvs` (pciutils, already installed). NOTE(review): on Ubuntu
+    # `vpddecode` is shipped by the `dmidecode` package, not by lshw — add
+    # dmidecode here if `vpddecode` is actually wanted for ad-hoc triage.
+    lshw
+    docker.io
+    python3-minimal
+    python3-pip
+    curl
+    iputils-ping
+    pciutils
+    libcurl4t64
+    libxml2
+    libxmlsec1-openssl
+    pciutils
 
 RemoveFiles=/etc/issue
 RemoveFiles=/etc/issue.net
diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.repart-debug/10-root.conf b/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.repart-debug/10-root.conf
index 45c4011117..d01f42b616 100644
--- a/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.repart-debug/10-root.conf
+++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.repart-debug/10-root.conf
@@ -2,4 +2,12 @@
 Type=root
 Format=ext4
 CopyFiles=/
-Minimize=guess
+# `Minimize=guess` chronically under-sizes the debug rootfs once the
+# NVIDIA 595 stack (driver-open + fabricmanager + nscq + persistenced +
+# nv-attestation-sdk) lands in /usr — the build either fails with
+# `no space left on device` mid-mkosi or produces a qcow2 whose
+# rootfs runs out of space at first boot. Pin to a fixed 12 GiB so
+# the debug variant has headroom for the full B200 CC userspace.
+Minimize=off
+SizeMinBytes=12G
+SizeMaxBytes=12G
diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/etc/modules-load.d/nvlink-fabric.conf b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/etc/modules-load.d/nvlink-fabric.conf
new file mode 100644
index 0000000000..7fbdd7fff7
--- /dev/null
+++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/etc/modules-load.d/nvlink-fabric.conf
@@ -0,0 +1,3 @@
+# Required by nv-fabricmanager NVL5+ subnet management path
+# (nvidia-fabricmanager-start.sh checks lsmod for ib_umad and exits if missing)
+ib_umad
diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-imex.service.d/override.conf b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-imex.service.d/override.conf
new file mode 100644
index 0000000000..8d59979721
--- /dev/null
+++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-imex.service.d/override.conf
@@ -0,0 +1,38 @@
+[Unit]
+
+[Service]
+# Gate on actual NVIDIA GPU PCI presence. Mirrors the pattern already used
+# by nvidia-persistenced.service.d / nvidia-fabricmanager.service.d /
+# nvidia-cdi-refresh.service.d so the unit is harmless on non-GPU VMs.
+#
+# Why this is needed for the B200 Service VM specifically:
+#   The qemu-shared-nvswitch SVM is GPU-LESS by design (it owns only the
+#   four CX7 LPFs that drive the NVSwitch fabric, not the eight B200 GPUs
+#   which are passed straight through to tenant VMs). nvidia-imex however
+#   tries to open() /dev/nvidiactl at startup — which on a GPU-less VM
+#   does not exist (no nvidia.ko loaded, no /dev/nvidia* nodes). The
+#   daemon then fails with NV_ERR_OPERATING_SYSTEM ("Failed to allocate
+#   handle to NVIDIA GPU driver") and exits.
+#
+#   That failure on its own would be tolerable, except the upstream
+#   nvidia-imex.service is Type=forking + TimeoutStartSec=infinity. The
+#   forking handshake gets stuck because the child exits BEFORE signaling
+#   the parent that it daemonized successfully, so the parent hangs in
+#   sigtimedwait() forever and `systemctl start nvidia-imex.service`
+#   never returns. (Observed first on 2026-05-20: svc-vm-bootstrap.sh
+#   §0c hung 5+ min waiting on the start; verbose log showed the
+#   NV_ERR_OPERATING_SYSTEM, /proc/PID/wchan = do_sigtimedwait.)
+#
+#   ExecCondition=/usr/local/bin/check-nvidia-gpu skips the unit cleanly
+#   on any VM that doesn't have a 10de:* PCI device — i.e. the SVM —
+#   the same way nvidia-fabricmanager already does. Tenant VMs (which
+#   DO have GPUs) still start nvidia-imex normally.
+ExecCondition=/usr/local/bin/check-nvidia-gpu
+
+# Belt-and-suspenders: cap any future forking-handshake hang at 2 min
+# instead of inheriting the upstream `TimeoutStartSec=infinity`. Even
+# on a real GPU node, an indefinite hang here would mask a real
+# config error (e.g. malformed nodes_config.cfg) and stall the whole
+# unit dependency graph. 120s is generous enough for the legitimate
+# case (driver init + IMEX cluster bootstrap) without being unbounded.
+TimeoutStartSec=120
diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-persistenced.service.d/override.conf b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-persistenced.service.d/override.conf
index c81de87569..d30225d48f 100644
--- a/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-persistenced.service.d/override.conf
+++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-persistenced.service.d/override.conf
@@ -3,6 +3,10 @@ After=nvidia-fabricmanager.service
 
 [Service]
 ExecCondition=/usr/local/bin/check-nvidia-gpu
+# Wait for fabric.state=Completed on every GPU (B200 CC race; see /usr/local/bin/wait-nvlink-fabric.sh).
+# NOTE(review): script default timeout is 180s; confirm TimeoutStartSec here exceeds it (systemd default: 90s).
+ExecStartPre=/usr/local/bin/wait-nvlink-fabric.sh
+
 ExecStart=
 ExecStart=/usr/bin/nvidia-persistenced --user nvidia-persistenced --uvm-persistence-mode --verbose
 ExecStartPost=/usr/bin/nvidia-smi conf-compute -srs 1
diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/local/bin/wait-nvlink-fabric.sh b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/local/bin/wait-nvlink-fabric.sh
new file mode 100755
index 0000000000..28eafab87b
--- /dev/null
+++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/local/bin/wait-nvlink-fabric.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# wait-nvlink-fabric.sh -- hold nvidia-persistenced back until NVML reports a
+# completed NVLink fabric registration for every GPU visible to this guest.
+#
+# Background (HGX B200 in CC/NVLE mode):
+#   The Service VM Fabric Manager programs NVSwitch routing for a partition
+#   asynchronously: the in-band NVLink MAD handshake with each guest GPU
+#   (Probe Request -> Probe Response) only happens AFTER
+#   fmActivateFabricPartition() has returned success, so on guest boot
+#   nvidia-persistenced races that handshake. A GPU registered too early makes
+#   NVML return 0x81 (NVLINK_FABRIC_NOT_READY); the daemon does not retry on
+#   0x81 and silently falls back to non-UVM persistence, which leaves that GPU
+#   permanently unable to do NVLink P2P. Per the NVIDIA Secure AI Operations
+#   Guide ("Ensure that Persistence Mode is On"), a missed SPDM/UVM session in
+#   CC mode can only be recovered with an FLR, i.e. a full VM restart -- hence
+#   we gate on fabric readiness instead of relying on the daemon's retry path.
+#
+# Wiring: ExecStartPre in the nvidia-persistenced drop-in
+#   /usr/lib/systemd/system/nvidia-persistenced.service.d/override.conf
+# Env: WAIT_FABRIC_TIMEOUT (seconds, default 180), WAIT_FABRIC_POLL (seconds,
+# default 2). Exits 0 once every GPU is ready, 1 on timeout so the unit fails
+# loudly instead of degrading silently.
+
+set -euo pipefail
+
+TIMEOUT_SEC="${WAIT_FABRIC_TIMEOUT:-180}"
+POLL_SEC="${WAIT_FABRIC_POLL:-2}"
+
+log() { printf "[wait-nvlink-fabric] %s\n" "$*" >&2; }
+
+# Echo the per-GPU CSV rows to stderr, indented under the log prefix.
+dump_states() { sed "s/^/[wait-nvlink-fabric]   /" <<<"$1" >&2; }
+
+# Count CSV rows whose trimmed fabric.state/.status is not Completed/Success.
+count_pending() {
+    awk -F, '
+        { gsub(/^ +| +$/, "", $3); gsub(/^ +| +$/, "", $4) }
+        $3 != "Completed" || $4 != "Success" { pending++ }
+        END { print pending + 0 }
+    ' <<<"$1"
+}
+
+deadline=$(( $(date +%s) + TIMEOUT_SEC ))
+for (( attempt = 1; ; attempt++ )); do
+    states=$(nvidia-smi --query-gpu=index,uuid,fabric.state,fabric.status \
+                --format=csv,noheader 2>/dev/null || true)
+
+    if [[ -n "$states" ]]; then
+        not_ready=$(count_pending "$states")
+        if (( not_ready == 0 )); then
+            log "all GPUs fabric ready (attempt $attempt)"
+            dump_states "$states"
+            exit 0
+        fi
+        log "attempt $attempt: $not_ready GPU(s) fabric not yet Completed/Success"
+    else
+        log "attempt $attempt: nvidia-smi returned empty (driver not ready yet)"
+    fi
+
+    if (( $(date +%s) >= deadline )); then
+        log "FAIL: NVLink fabric did not become ready within ${TIMEOUT_SEC}s"
+        log "current per-GPU state:"
+        dump_states "${states:-<empty>}"
+        exit 1
+    fi
+
+    sleep "$POLL_SEC"
+done
diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/src/fmctl-probe/fmctl-probe.cpp b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/src/fmctl-probe/fmctl-probe.cpp
new file mode 100644
index 0000000000..7296770166
--- /dev/null
+++ b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/src/fmctl-probe/fmctl-probe.cpp
@@ -0,0 +1,407 @@
+// fmctl-probe: minimal client for the NVIDIA Fabric Manager SDK.
+//
+// Purpose
+// -------
+// Phase-0 spike for the Service-VM model. Validates that an FM running in
+// FABRIC_MODE=1 inside the Service VM:
+//   1. accepts FM-SDK calls on its TCP port (default 127.0.0.1:6666),
+//   2. enumerates the supported "shared NVSwitch" partition catalogue,
+//   3. activates / deactivates a chosen partition.
+//
+// Build / install
+// ---------------
+// The CANONICAL build path is image-time, NOT runtime. This source is
+// vendored into cloud-api-adaptor at:
+//   src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/usr/src/fmctl-probe/fmctl-probe.cpp
+// and compiled by mkosi.postinst inside the rootfs (chroot ${BUILDROOT})
+// against the image's own libnvfm from nvidia-fabricmanager-dev. The
+// resulting /usr/local/bin/fmctl-probe ships baked into the podvm qcow2.
+// The two copies (this one and the CAA one) MUST stay byte-identical;
+// run-podvm-build / 04-build-podvm-locally.sh do not auto-sync them.
+// verify-svc-vm.sh enforces presence of /usr/local/bin/fmctl-probe and the
+// `resolve` subcommand, so an out-of-date or stripped image fails closed.
+//
+// Manual (host- or SVM-side rebuild for ad-hoc debugging only):
+//     g++ -std=c++17 -O2 fmctl-probe.cpp -lnvfm -o fmctl-probe
+//
+// Usage
+// -----
+//     fmctl-probe list                          # dump partition catalogue
+//     fmctl-probe activate <id>                 # activate partition <id>
+//     fmctl-probe deactivate <id>               # deactivate partition <id>
+//     fmctl-probe resolve <bdf,bdf,...>         # match by FM-reported pciBusId
+//     fmctl-probe resolve-by-physids <id,id,..> # match by FM-reported physicalId
+//
+// `resolve` works when FM has GPU BDF info populated (i.e. in single-host
+// non-FABRIC_MODE setups where FM and the GPUs share an OS instance and the
+// NVIDIA driver is loaded locally). In our qemu-shared-nvswitch SVM topology
+// FM runs in FABRIC_MODE=1 with NO GPUs in its OS (the GPUs live in tenant
+// VMs), so fmGetSupportedFabricPartitions().gpuInfo[].pciBusId is empty for
+// every entry and `resolve` ALWAYS returns "no match". Use
+// `resolve-by-physids` instead in that case: pass a comma-separated list of
+// physicalId integers (the host computes them from the GPU PCI BDFs sorted
+// by bus number, the canonical B200 HGX baseboard mapping) and we match
+// against fmGetSupportedFabricPartitions().gpuInfo[].physicalId, which IS
+// populated by FM regardless of who owns the GPUs.
+//
+// Both resolve flavors exit 0 with the id on stdout, 2 on no match,
+// 3 on ambiguous.
+//
+// Override target with FM_ADDR env var, e.g. FM_ADDR=127.0.0.1:6666.
+
+#include <nv_fm_agent.h>
+
+#include <algorithm>
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Map an fmReturn_t status to its symbolic name (the enumerator minus the
+// FM_ST_ prefix) for diagnostics. Codes absent from the table yield "UNKNOWN".
+static const char* fmReturnStr(fmReturn_t r) {
+    static const struct { fmReturn_t code; const char* name; } kNames[] = {
+        {FM_ST_SUCCESS, "SUCCESS"}, {FM_ST_BADPARAM, "BADPARAM"},
+        {FM_ST_GENERIC_ERROR, "GENERIC_ERROR"}, {FM_ST_NOT_SUPPORTED, "NOT_SUPPORTED"},
+        {FM_ST_UNINITIALIZED, "UNINITIALIZED"}, {FM_ST_TIMEOUT, "TIMEOUT"},
+        {FM_ST_VERSION_MISMATCH, "VERSION_MISMATCH"}, {FM_ST_IN_USE, "IN_USE"},
+        {FM_ST_NOT_CONFIGURED, "NOT_CONFIGURED"},
+        {FM_ST_CONNECTION_NOT_VALID, "CONNECTION_NOT_VALID"},
+        {FM_ST_NVLINK_ERROR, "NVLINK_ERROR"}, {FM_ST_RESOURCE_BAD, "RESOURCE_BAD"},
+        {FM_ST_RESOURCE_IN_USE, "RESOURCE_IN_USE"},
+        {FM_ST_RESOURCE_NOT_IN_USE, "RESOURCE_NOT_IN_USE"},
+        {FM_ST_RESOURCE_EXHAUSTED, "RESOURCE_EXHAUSTED"},
+        {FM_ST_RESOURCE_NOT_READY, "RESOURCE_NOT_READY"},
+        {FM_ST_PARTITION_EXISTS, "PARTITION_EXISTS"},
+        {FM_ST_PARTITION_ID_IN_USE, "PARTITION_ID_IN_USE"},
+        {FM_ST_PARTITION_ID_NOT_IN_USE, "PARTITION_ID_NOT_IN_USE"},
+        {FM_ST_NOT_READY, "NOT_READY"},
+    };
+    for (const auto& entry : kNames) {
+        if (entry.code == r) return entry.name;
+    }
+    return "UNKNOWN";
+}
+
+// Connect to Fabric Manager over TCP and store the session in *handle. The
+// endpoint is $FM_ADDR, or 127.0.0.1:6666 when that is unset/empty; the timeout
+// is 5000 ms. Returns fmConnect()'s status and reports the outcome on stderr.
+static fmReturn_t connectFm(fmHandle_t* handle) {
+    const char* env = std::getenv("FM_ADDR");
+    const char* target = (env != nullptr && env[0] != '\0') ? env : "127.0.0.1:6666";
+
+    fmConnectParams_t cfg{};
+    cfg.version = fmConnectParams_version;
+    cfg.timeoutMs = 5000;
+    cfg.addressIsUnixSocket = 0;
+    cfg.addressType = NV_FM_API_ADDR_TYPE_INET;
+    std::snprintf(cfg.addressInfo, sizeof(cfg.addressInfo), "%s", target);
+    const fmReturn_t status = fmConnect(&cfg, handle);
+    if (status != FM_ST_SUCCESS) {
+        std::fprintf(stderr, "fmConnect(%s) failed: %s (%d)\n", target, fmReturnStr(status), status);
+        return status;
+    }
+    // Banner goes to stderr: stdout carries only machine-readable subcommand
+    // output (callers do `pid=$(fmctl-probe ...)`), so it must stay clean.
+    std::fprintf(stderr, "[fmctl] connected to %s\n", target);
+    return status;
+}
+
+// `list` subcommand: print FM's supported-partition catalogue to stdout, one
+// line per partition and per member GPU. Returns 0, or 1 if the FM query fails.
+static int doList(fmHandle_t h) {
+    fmFabricPartitionList_t list{};
+    list.version = fmFabricPartitionList_version;
+    const fmReturn_t status = fmGetSupportedFabricPartitions(h, &list);
+    if (status != FM_ST_SUCCESS) {
+        std::fprintf(stderr, "fmGetSupportedFabricPartitions failed: %s (%d)\n", fmReturnStr(status), status);
+        return 1;
+    }
+
+    std::printf("supported partitions: %u (max %u on this platform)\n",
+                list.numPartitions, list.maxNumPartitions);
+    for (const auto* p = list.partitionInfo; p != list.partitionInfo + list.numPartitions; ++p) {
+        std::printf("  partition id=%u  active=%u  numGpus=%u\n",
+                    p->partitionId, p->isActive, p->numGpus);
+        for (const auto* gpu = p->gpuInfo; gpu != p->gpuInfo + p->numGpus; ++gpu) {
+            std::printf("    gpu physicalId=%u  uuid=%s  pci=%s  nvlinks=%u/%u\n",
+                        gpu->physicalId, gpu->uuid, gpu->pciBusId,
+                        gpu->numNvLinksAvailable, gpu->maxNumNvLinks);
+        }
+    }
+    return 0;
+}
+
+// `activate` / `deactivate`: echo FM's verdict on stdout; return 0 on SUCCESS, else 1.
+static int doActivate(fmHandle_t h, fmFabricPartitionId_t id) {
+    const fmReturn_t status = fmActivateFabricPartition(h, id);
+    std::printf("fmActivateFabricPartition(%u) -> %s (%d)\n", id, fmReturnStr(status), status);
+    return (status != FM_ST_SUCCESS) ? 1 : 0;
+}
+static int doDeactivate(fmHandle_t h, fmFabricPartitionId_t id) {
+    const fmReturn_t status = fmDeactivateFabricPartition(h, id);
+    std::printf("fmDeactivateFabricPartition(%u) -> %s (%d)\n", id, fmReturnStr(status), status);
+    return (status != FM_ST_SUCCESS) ? 1 : 0;
+}
+
+// Normalize a PCI BDF string to canonical "DDDDDDDD:BB:DD.F" (lowercase hex).
+// Accepts the shapes that show up in practice:
+//   "c0:00.0"           -- bus:dev.fn (domain implicit zero); operator typed
+//   "0000:c0:00.0"      -- 4-hex-digit domain; some Linux tools
+//   "00000000:c0:00.0"  -- 8-hex-digit domain; what NVML/FM-SDK returns
+//   "00000000:C0:00.0"  -- same with uppercase bus
+// Returns "" when malformed: any character besides hex digits, ':' and '.'
+// (bare %x would also take signs, "0x" and blanks), trailing junk, or a field
+// wider than PCI allows (bus > 0xff, dev > 0x1f, fn > 7). Lets doResolve()
+// compare operator-typed and FM-reported BDFs regardless of case or domain.
+static std::string normalizeBdf(const std::string& in) {
+    if (in.find_first_not_of("0123456789abcdefABCDEF:.") != std::string::npos) return "";
+    const int len = static_cast<int>(in.size());
+    unsigned dom = 0, bus = 0, dev = 0, fn = 0;
+    int n = 0;
+    if (std::sscanf(in.c_str(), "%x:%x:%x.%x%n", &dom, &bus, &dev, &fn, &n) != 4 || n != len) {
+        // Not domain:bus:dev.fn -- retry as bus:dev.fn with an implicit zero domain.
+        dom = 0;  // the failed first attempt may have parsed the bus into dom
+        n = 0;
+        if (std::sscanf(in.c_str(), "%x:%x.%x%n", &bus, &dev, &fn, &n) != 3 || n != len) return "";
+    }
+    if (bus > 0xff || dev > 0x1f || fn > 7) return "";
+    char buf[32];
+    std::snprintf(buf, sizeof(buf), "%08x:%02x:%02x.%x", dom, bus, dev, fn);
+    return buf;
+}
+
+// `resolve` subcommand: map a comma-separated PCI BDF list onto the single
+// supported partition whose GPU BDF set is exactly that list. This is the
+// runtime side of the "operator declares GPUs, FM picks the partition"
+// contract documented in docs/PARTITION-MAPPING.md. Whitespace inside a token
+// and empty tokens are ignored; BDFs are compared after normalizeBdf().
+//
+// Return value (used as the process exit code):
+//   0  unique match; the partition id alone is printed on stdout
+//   1  fmGetSupportedFabricPartitions() failed (unrecoverable)
+//   2  a BDF failed to parse, the list was empty, or no partition matches
+//   3  more than one partition matches (should never happen on a Blackwell
+//      baseboard, where each (numGpus, gpu-set) combination is unique; we
+//      fail loud rather than silently activate the first hit)
+static int doResolve(fmHandle_t h, const std::string& bdfsCsv) {
+    fmFabricPartitionList_t catalogue{};
+    catalogue.version = fmFabricPartitionList_version;
+    const fmReturn_t status = fmGetSupportedFabricPartitions(h, &catalogue);
+    if (status != FM_ST_SUCCESS) {
+        std::fprintf(stderr,
+            "fmGetSupportedFabricPartitions failed: %s (%d)\n",
+            fmReturnStr(status), status);
+        return 1;
+    }
+
+    // Requested BDFs, normalized; the std::set also drops duplicates.
+    std::set<std::string> wanted;
+    std::stringstream fields(bdfsCsv);
+    for (std::string field; std::getline(fields, field, ',');) {
+        std::string compact;  // `field` with every whitespace character removed
+        for (unsigned char c : field) {
+            if (!std::isspace(c)) compact.push_back(static_cast<char>(c));
+        }
+        if (compact.empty()) continue;
+
+        const std::string bdf = normalizeBdf(compact);
+        if (bdf.empty()) {
+            std::fprintf(stderr,
+                "resolve: cannot parse BDF '%s' "
+                "(expected DDDD:BB:DD.F or BB:DD.F)\n",
+                compact.c_str());
+            return 2;
+        }
+        wanted.insert(bdf);
+    }
+    if (wanted.empty()) {
+        std::fprintf(stderr, "resolve: empty BDF list\n");
+        return 2;
+    }
+
+    // Collect the ids of all partitions whose BDF set equals `wanted`.
+    std::vector<unsigned> matches;
+    for (unsigned i = 0; i < catalogue.numPartitions; ++i) {
+        const auto& part = catalogue.partitionInfo[i];
+        if (part.numGpus != wanted.size()) continue;  // cheap reject on size
+
+        std::set<std::string> have;
+        for (unsigned g = 0; g < part.numGpus; ++g) {
+            const std::string bdf = normalizeBdf(part.gpuInfo[g].pciBusId);
+            // FM entries that do not parse are left out of the set.
+            if (!bdf.empty()) have.insert(bdf);
+        }
+        if (have == wanted) matches.push_back(part.partitionId);
+    }
+
+    if (matches.size() == 1) {
+        // stdout carries only the partition id, for shell wrappers to capture.
+        std::printf("%u\n", matches.front());
+        return 0;
+    }
+
+    if (matches.empty()) {
+        std::fprintf(stderr,
+            "resolve: no supported partition matches BDF set {");
+        const char* sep = "";
+        for (const auto& bdf : wanted) {
+            std::fprintf(stderr, "%s%s", sep, bdf.c_str());
+            sep = ",";
+        }
+        std::fprintf(stderr,
+            "} -- check `fmctl-probe list` for the supported partition catalogue.\n");
+        return 2;
+    }
+
+    std::fprintf(stderr,
+        "resolve: AMBIGUOUS -- %zu partitions match BDF set:",
+        matches.size());
+    for (unsigned id : matches) std::fprintf(stderr, " %u", id);
+    std::fprintf(stderr,
+        "\n(this is unexpected on a Blackwell baseboard; report to NVIDIA.)\n");
+    return 3;
+}
+
+// `resolve-by-physids` subcommand: map a comma-separated physicalId list onto
+// the single supported partition whose GPU physicalId set is exactly that
+// list. For FABRIC_MODE=1 / shared-NVSwitch topologies, where FM has no local
+// GPUs, so pciBusId is empty in fmGetSupportedFabricPartitions() while
+// physicalId is still populated. Ids must be plain decimal and fit `unsigned`.
+// Exit codes mirror doResolve():
+//   0  unique match; partition id printed on stdout
+//   1  fmGetSupportedFabricPartitions() failed (unrecoverable)
+//   2  parse failure, empty list, or no supported partition matches
+//   3  more than one partition matches (should not happen on Blackwell)
+static int doResolveByPhysIds(fmHandle_t h, const std::string& idsCsv) {
+    fmFabricPartitionList_t list{};
+    list.version = fmFabricPartitionList_version;
+    fmReturn_t r = fmGetSupportedFabricPartitions(h, &list);
+    if (r != FM_ST_SUCCESS) {
+        std::fprintf(stderr, "fmGetSupportedFabricPartitions failed: %s (%d)\n",
+                     fmReturnStr(r), r);
+        return 1;
+    }
+
+    // Parse the requested physicalId list into a sorted set.
+    std::set<unsigned> wanted;
+    {
+        std::stringstream ss(idsCsv);
+        std::string tok;
+        while (std::getline(ss, tok, ',')) {
+            tok.erase(std::remove_if(tok.begin(), tok.end(),
+                                     [](unsigned char c){ return std::isspace(c); }),
+                      tok.end());
+            if (tok.empty()) continue;
+            // Digits only, parsed wide, and the narrowing must be lossless: a
+            // bare strtoul() accepts a sign ("-1" wraps to a huge value) and the
+            // cast would fold e.g. "4294967296" onto physicalId 0.
+            const unsigned long long wide = std::strtoull(tok.c_str(), nullptr, 10);
+            const unsigned id = static_cast<unsigned>(wide);
+            if (tok.find_first_not_of("0123456789") != std::string::npos || id != wide) {
+                std::fprintf(stderr,
+                    "resolve-by-physids: cannot parse physicalId '%s' (expected integer)\n",
+                    tok.c_str());
+                return 2;
+            }
+            wanted.insert(id);
+        }
+    }
+    if (wanted.empty()) {
+        std::fprintf(stderr, "resolve-by-physids: empty id list\n");
+        return 2;
+    }
+
+    std::vector<unsigned> matches;
+    for (unsigned i = 0; i < list.numPartitions; ++i) {
+        const auto& p = list.partitionInfo[i];
+        if (p.numGpus != wanted.size()) continue;  // fast reject on size
+        std::set<unsigned> have;
+        for (unsigned g = 0; g < p.numGpus; ++g) {
+            have.insert(p.gpuInfo[g].physicalId);
+        }
+        if (have == wanted) matches.push_back(p.partitionId);
+    }
+
+    if (matches.empty()) {
+        std::fprintf(stderr,
+            "resolve-by-physids: no supported partition matches physicalId set {");
+        bool first = true;
+        for (unsigned id : wanted) {
+            std::fprintf(stderr, "%s%u", first ? "" : ",", id);
+            first = false;
+        }
+        std::fprintf(stderr,
+            "} -- check `fmctl-probe list` for the supported partition catalogue.\n");
+        return 2;
+    }
+    if (matches.size() > 1) {
+        std::fprintf(stderr,
+            "resolve-by-physids: AMBIGUOUS -- %zu partitions match physicalId set:",
+            matches.size());
+        for (unsigned id : matches) std::fprintf(stderr, " %u", id);
+        std::fprintf(stderr,
+            "\n(this is unexpected on a Blackwell baseboard; report to NVIDIA.)\n");
+        return 3;
+    }
+
+    // stdout: just the partition id, machine-readable for shell wrappers.
+    std::printf("%u\n", matches[0]);
+    return 0;
+}
+
+// Print the command-line synopsis to stderr; argv0 is the program name shown
+// on the "Usage:" line.
+static void usage(const char* argv0) {
+    std::fprintf(stderr, "Usage: %s <command> [args]\n", argv0);
+    std::fputs("Commands:\n"
+               "  list                          enumerate supported partitions\n"
+               "  activate <id>                 activate partition <id>\n"
+               "  deactivate <id>               deactivate partition <id>\n"
+               "  resolve <bdf,bdf,...>         match by FM-reported pciBusId\n"
+               "                                  (works only when FM has local GPUs)\n"
+               "  resolve-by-physids <id,id,..> match by FM-reported physicalId\n"
+               "                                  (use in FABRIC_MODE=1 / shared NVSwitch\n"
+               "                                  -- pciBusId is empty in that mode)\n"
+               "Both resolve flavors: exit 0 + id on stdout, 2 if no match, 3 if ambiguous.\n"
+               "Environment:\n"
+               "  FM_ADDR                       FM SDK address (default 127.0.0.1:6666)\n", stderr);
+}
+
+// Entry point. Arguments are validated BEFORE touching FM, so usage errors
+// exit 2 even when FM is down, and activate/deactivate insist on a decimal id:
+// std::atoi() would map "", "abc" or the empty output of a failed `$(fmctl-probe
+// resolve ...)` to partition 0. Exit: subcommand result; 1 if FM init/connect fails.
+int main(int argc, char** argv) {
+    const std::string cmd = argc >= 2 ? argv[1] : "";
+    const std::string arg = argc >= 3 ? argv[2] : "";
+    const bool isPartitionOp = cmd == "activate" || cmd == "deactivate";
+    const bool isResolve = cmd == "resolve" || cmd == "resolve-by-physids";
+    const unsigned long long wide = std::strtoull(arg.c_str(), nullptr, 10);
+    const auto id = static_cast<fmFabricPartitionId_t>(wide);  // idOk rejects truncation
+    const bool idOk = !arg.empty() && id == wide
+        && arg.find_first_not_of("0123456789") == std::string::npos;
+    const bool valid = cmd == "list" || (isResolve && argc >= 3) || (isPartitionOp && idOk);
+    if (!valid) { usage(argv[0]); return 2; }
+
+    const fmReturn_t init = fmLibInit();
+    if (init != FM_ST_SUCCESS) {
+        std::fprintf(stderr, "fmLibInit failed: %s (%d)\n", fmReturnStr(init), init);
+        return 1;
+    }
+    fmHandle_t h = nullptr;
+    if (connectFm(&h) != FM_ST_SUCCESS) { fmLibShutdown(); return 1; }
+
+    const int rc = cmd == "list" ? doList(h)
+        : cmd == "activate" ? doActivate(h, id)
+        : cmd == "deactivate" ? doDeactivate(h, id)
+        : cmd == "resolve" ? doResolve(h, arg)
+        : doResolveByPhysIds(h, arg);
+    fmDisconnect(h);
+    fmLibShutdown();
+    return rc;
+}
diff --git a/src/cloud-api-adaptor/podvm/Dockerfile.podvm_binaries.ubuntu b/src/cloud-api-adaptor/podvm/Dockerfile.podvm_binaries.ubuntu
index ca7cb06dac..ec62bb4199 100644
--- a/src/cloud-api-adaptor/podvm/Dockerfile.podvm_binaries.ubuntu
+++ b/src/cloud-api-adaptor/podvm/Dockerfile.podvm_binaries.ubuntu
@@ -17,6 +17,8 @@ ARG CUSTOM_GC_BINARIES=""
 ARG AA_FEATURES=""
 ARG GUEST_COMPONENTS_REF=""
 ARG GUEST_COMPONENTS_REPO="https://github.com/confidential-containers/guest-components.git"
+ARG NVAT_REPO="https://github.com/NVIDIA/attestation-sdk.git"
+ARG NVAT_TAG="2026.03.02"
 ARG DEBIAN_FRONTEND=noninteractive
 RUN set -e; \
     if [ -n "${CUSTOM_GC_BINARIES}" ] && [ -z "${GUEST_COMPONENTS_REF}" ]; then \
@@ -26,7 +28,8 @@ RUN set -e; \
     if [ -n "${CUSTOM_GC_BINARIES}" ] && [ -n "${GUEST_COMPONENTS_REF}" ]; then \
       apt-get update && \
       apt-get install -y --no-install-recommends \
-        protobuf-compiler pkg-config clang libssl-dev libtss2-dev && \
+        protobuf-compiler pkg-config clang libclang-dev libssl-dev libtss2-dev \
+        cmake libcurl4-openssl-dev libxml2-dev libxmlsec1-dev libxmlsec1-openssl && \
       apt-get clean && rm -rf /var/lib/apt/lists/* && \
       mkdir -p /build/gc && cd /build/gc && \
       git init && \
@@ -34,7 +37,25 @@ RUN set -e; \
       git fetch --depth=1 origin "${GUEST_COMPONENTS_REF}" && \
       git reset --hard FETCH_HEAD; \
     fi
+# Build the NVIDIA Attestation SDK (libnvat) from source whenever AA_FEATURES
+# names nvidia-attester, so that feature can link against it. The AA binary
+# links libnvat.so dynamically, so the library must also ship in the final PodVM
+# image. Build deps are installed here too, so CUSTOM_GC_BINARIES may be empty.
+RUN set -e; \
+    case "${AA_FEATURES}" in *nvidia-attester*) \
+      apt-get update && \
+      apt-get install -y --no-install-recommends cmake libssl-dev libcurl4-openssl-dev \
+        libxml2-dev libxmlsec1-dev libxmlsec1-openssl && \
+      apt-get clean && rm -rf /var/lib/apt/lists/* && \
+      git clone --depth 1 --branch "${NVAT_TAG}" "${NVAT_REPO}" /build/nvat && \
+      cd /build/nvat/nv-attestation-sdk-cpp && \
+      cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_INSTALL_LIBDIR=lib && \
+      cmake --build build && \
+      cmake --install build && \
+      ldconfig ;; \
+    esac
 COPY cloud-api-adaptor/podvm/build-guest-components.sh /build/
+ENV NVAT_USE_SYSTEM_LIB=1
 RUN /build/build-guest-components.sh "${CUSTOM_GC_BINARIES}" "${AA_FEATURES}"
 
 # ubuntu:24.04
@@ -146,5 +167,14 @@ RUN for bin in /tmp/gc-overrides/*; do \
       install -m0755 "$bin" /src/cloud-api-adaptor/podvm/files/usr/local/bin/"$(basename "$bin")"; \
     done; true
 
+# Copy libnvat if it was built (attestation-agent needs it at runtime with the
+# nvidia-attester feature); a mount avoids COPY failing on an empty glob. No
+# trailing `; true`: a skipped `if` already exits 0, and a failed cp must fail.
+RUN --mount=from=gc_builder,src=/usr/lib/,dst=/tmp/gc-lib/,readonly \
+    if ls /tmp/gc-lib/libnvat* 1>/dev/null 2>&1; then \
+      mkdir -p /src/cloud-api-adaptor/podvm/files/usr/lib/ && \
+      cp /tmp/gc-lib/libnvat* /src/cloud-api-adaptor/podvm/files/usr/lib/; \
+    fi
+
 FROM scratch
 COPY --from=podvm_binaries_builder /src/cloud-api-adaptor/podvm/files /
diff --git a/src/cloud-api-adaptor/podvm/build-guest-components.sh b/src/cloud-api-adaptor/podvm/build-guest-components.sh
index 2f586d4352..3d9de4f419 100755
--- a/src/cloud-api-adaptor/podvm/build-guest-components.sh
+++ b/src/cloud-api-adaptor/podvm/build-guest-components.sh
@@ -30,6 +30,10 @@ for bin in "${BINS[@]}"; do
         exit 1
       fi
       cd /build/gc/attestation-agent/attestation-agent
+      # Refresh lockfile so optional feature deps (e.g. nv-attestation-sdk
+      # for nvidia-attester) are resolved even if the checked-in Cargo.lock
+      # was generated without them.
+      cargo update --workspace
       cargo build --release --locked --no-default-features \
         --features "$AA_FEATURES" --bin ttrpc-aa
       cp /build/gc/target/release/ttrpc-aa "$OUTDIR/attestation-agent"