cohere-ai · alhassankhedr-cohere · May 8, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/.github/workflows/build-podvm-cohere.yaml b/.github/workflows/build-podvm-cohere.yaml
@@ -22,10 +22,20 @@ on:
         type: string
         default: "https://github.com/cohere-ai/guest-components.git"
       guest_components_ref:
-        description: "guest-components ref (default: cohere)"
+        description: |
+          guest-components ref (branch, tag, or SHA).
+
+          Default: alhassankhedr/sync-main-to-cohere (head of PR #9).
+          That branch carries upstream main's nvidia-attester rewrite
+          (NVAT SDK based, no `count == 1` guard) and is required for
+          multi-GPU evidence to work end-to-end on 8x B200 hosts. The
+          plain `cohere` branch still has the old NVML-based attester,
+          which silently produces empty evidence on 2+ GPU systems
+          (apart from the sed `s/count == 1/count >= 1/` workaround that the
+          podvm-mkosi Dockerfile applies). Switch back to `cohere` after PR #9 merges.
         required: false
         type: string
-        default: "cohere"
+        default: "alhassankhedr/sync-main-to-cohere"
       custom_gc_binaries:
         description: "guest-components binaries to build from source"
         required: false
@@ -46,6 +56,11 @@ on:
         required: false
         type: boolean
         default: false
+      b200_cc_drivers:
+        description: "Install NVIDIA 595.71.05 open driver (enables Confidential Computing on multi-GPU B200). EXPERIMENTAL"
+        required: false
+        type: boolean
+        default: false
 
 permissions:
   id-token: write      # OIDC token for build provenance attestation
@@ -70,11 +85,13 @@ jobs:
       image_name_debug: ${{ steps.compute.outputs.image_name_debug }}
       image_tag_release: ${{ steps.compute.outputs.image_tag_release }}
       image_tag_debug: ${{ steps.compute.outputs.image_tag_debug }}
+      b200_cc_drivers: ${{ steps.compute.outputs.b200_cc_drivers }}
     steps:
       - name: Compute tags and image names
         id: compute
         env:
           DISTRO: ${{ inputs.distro || 'ubuntu' }}
+          B200_CC_DRIVERS: ${{ inputs.b200_cc_drivers && 'true' || 'false' }}
         run: |
           if [[ "$GITHUB_REF" == refs/tags/podvm-v* ]]; then
             TAG="${GITHUB_REF#refs/tags/podvm-}"
@@ -84,10 +101,15 @@ jobs:
             REPLACE_IMAGE="true"
           fi
           TAG="${TAG//./-}"
+          # Suffix CC-driver builds so they never collide with standard images
+          if [ "$B200_CC_DRIVERS" = "true" ]; then
+            TAG="${TAG}-cc595"
+          fi
           {
             echo "tag=$TAG"
             echo "distro=$DISTRO"
             echo "replace_image=$REPLACE_IMAGE"
+            echo "b200_cc_drivers=$B200_CC_DRIVERS"
             echo "image_name_release=podvm-${DISTRO}-${TEE_PLATFORM}-release-${TAG}"
             echo "image_name_debug=podvm-${DISTRO}-${TEE_PLATFORM}-debug-${TAG}"
             echo "image_tag_release=${TAG}-${DISTRO}-release"
@@ -176,7 +198,7 @@ jobs:
           PODVM_DISTRO: ${{ needs.meta.outputs.distro }}
           AA_FEATURES: ${{ inputs.aa_features || 'bin,ttrpc,kbs,coco_as,rust-crypto,tdx-attester,nvidia-attester' }}
           GC_REPO: ${{ inputs.guest_components_repo || 'https://github.com/cohere-ai/guest-components.git' }}
-          GC_REF: ${{ inputs.guest_components_ref || 'cohere' }}
+          GC_REF: ${{ inputs.guest_components_ref || 'alhassankhedr/sync-main-to-cohere' }}
           GC_CUSTOM_BINARIES: ${{ inputs.custom_gc_binaries || 'attestation-agent,api-server-rest' }}
         run: |
           MAKE_ARGS=(
@@ -195,6 +217,47 @@ jobs:
           echo "Disk after binaries build:"
           df -h /
 
+      - name: Override NVIDIA driver to 595.71.05 (B200 multi-GPU CC)
+        if: needs.meta.outputs.b200_cc_drivers == 'true'
+        working-directory: src/cloud-api-adaptor/podvm-mkosi
+        run: |
+          set -euo pipefail
+          CONF=mkosi.presets/system/mkosi.conf.d/ubuntu.conf
+          # The 595 branch only ships the unversioned `nvidia-driver-open`
+          # metapackage in NVIDIA's CUDA repo (no `nvidia-driver-595-open`).
+          # Match the existing pin with any branch suffix or none, not only -580.
+          sed -i -E \
+            -e 's|^([[:space:]]*)nvidia-driver(-[0-9]+)?-open=.*|\1nvidia-driver-open=595.71.05-1ubuntu1|' \
+            -e 's|^([[:space:]]*)nvidia-persistenced=.*|\1nvidia-persistenced=595.71.05-1ubuntu1|' \
+            -e 's|^([[:space:]]*)nvidia-fabricmanager=.*|\1nvidia-fabricmanager=595.71.05-1ubuntu1|' \
+            -e 's|^([[:space:]]*)libnvidia-nscq=.*|\1libnvidia-nscq=595.71.05-1ubuntu1|' \
+            "$CONF"
+          echo "----- Updated NVIDIA package pins -----"
+          grep -E '^[[:space:]]*(nvidia|libnvidia)' "$CONF"
+
+      - name: Increase debug root partition for CC595 drivers
+        if: needs.meta.outputs.b200_cc_drivers == 'true' && matrix.profile == 'debug'
+        working-directory: src/cloud-api-adaptor/podvm-mkosi
+        run: |
+          set -euo pipefail
+          CONF=mkosi.presets/system/mkosi.repart-debug/10-root.conf
+          # With the NVIDIA 595 drivers installed, systemd-repart's
+          # Minimize=guess under-estimates the root filesystem and mkfs.ext4
+          # aborts with "No space left on device"; write a fixed 12G root instead.
+          printf '%s\n' '[Partition]' 'Type=root' 'Format=ext4' 'CopyFiles=/' 'Minimize=off' 'SizeMinBytes=12G' 'SizeMaxBytes=12G' > "$CONF"
+          echo "----- Updated repart config -----"
+          cat "$CONF"
+
+      - name: Resolve installed NVIDIA driver version
+        working-directory: src/cloud-api-adaptor/podvm-mkosi
+        run: |
+          set -euo pipefail
+          CONF=mkosi.presets/system/mkosi.conf.d/ubuntu.conf
+          # Take x.y.z from the first nvidia-driver[-<branch>]-open pin; abort with a message if there is none.
+          DRIVER_VER=$(sed -nE 's|^[[:space:]]*nvidia-driver(-[0-9]+)?-open=([0-9]+\.[0-9]+\.[0-9]+).*|\2|p' "$CONF" | head -n1)
+          echo "Resolved NVIDIA driver version: ${DRIVER_VER:?no nvidia-driver*-open=<x.y.z> pin found in $CONF}"
+          echo "NVIDIA_DRIVER=$DRIVER_VER" >> "$GITHUB_ENV"
+
       - name: Build OS image
         working-directory: src/cloud-api-adaptor/podvm-mkosi
         env:
@@ -265,6 +328,7 @@ jobs:
             --arg distro "$DISTRO" \
             --arg profile "$PROFILE" \
             --arg tee_platform "$TEE_PLATFORM" \
+            --arg nvidia_driver "$NVIDIA_DRIVER" \
             --arg build_date "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
             '$ARGS.named' > /tmp/measurements.json
 
@@ -307,6 +371,7 @@ jobs:
             --annotation "com.cohere.caa.commit=${CAA_COMMIT}" \
             --annotation "com.cohere.caa.version=${GITHUB_REF_NAME}" \
             --annotation "com.cohere.rtmr2=${RTMR2}" \
+            --annotation "com.cohere.nvidia.driver=${NVIDIA_DRIVER}" \
             --format json > oras-output.json
 
           cat oras-output.json

diff --git a/src/cloud-api-adaptor/podvm-mkosi/Dockerfile.mkosi.ubuntu b/src/cloud-api-adaptor/podvm-mkosi/Dockerfile.mkosi.ubuntu
@@ -51,17 +51,40 @@ COPY mkosi.conf.ubuntu /image/mkosi.conf
 
 # Add NVIDIA APT repos to mkosi.skeleton/ so they are present in the image tree
 # *before* package installation. mkosi applies SkeletonTrees before apt-get runs.
+#
+# Three NVIDIA repos are wired in:
+#   1. CUDA repo (developer.download.nvidia.com/compute/cuda)
+#      Provides nvidia-driver-open, nvidia-fabricmanager, libnvidia-nscq,
+#      nvidia-imex, libnvsdm, nvlsm, collectx-bringup, mft, mft-oem,
+#      mft-autocomplete, etc. — the full R595 + nvlink5 stack EXCEPT for
+#      the `ucx` dependency that collectx-bringup pulls in.
+#   2. nvidia-container-toolkit repo (nvidia.github.io/libnvidia-container)
+#      Provides nvidia-container-toolkit and friends.
+#   3. NVIDIA DOCA-Host networking repo (linux.mellanox.com/public/repo/doca)
+#      Provides `ucx` (Unified Communication X) and the matching MOFED
+#      userspace stack. We need it ONLY for `ucx` (so collectx-bringup's
+#      Depends: ucx resolves) and as an alternate source for mft*/
+#      collectx-bringup. The repo is pinned at priority 100 so it acts as
+#      a fallback — packages already available from the cuda repo or from
+#      Ubuntu universe (e.g. rdma-core, ibverbs-utils, libibumad3) are
+#      NOT replaced by the MOFED variants, which would otherwise turn the
+#      image into a MOFED-userspace install. Only ucx (which exists ONLY
+#      in DOCA) and any explicitly-requested package fall through to it.
 RUN mkdir -p /image/mkosi.skeleton/etc/apt/sources.list.d \
              /image/mkosi.skeleton/etc/apt/preferences.d \
              /image/mkosi.skeleton/usr/share/keyrings \
     && curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-archive-keyring.gpg \
        -o /image/mkosi.skeleton/usr/share/keyrings/cuda-archive-keyring.gpg \
     && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
        | gpg --dearmor -o /image/mkosi.skeleton/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+    && curl -fsSL https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox \
+       | gpg --dearmor -o /image/mkosi.skeleton/usr/share/keyrings/nvidia-doca-keyring.gpg \
     && echo 'deb [signed-by=/usr/share/keyrings/cuda-archive-keyring.gpg] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/ /' \
        > /image/mkosi.skeleton/etc/apt/sources.list.d/cuda-ubuntu2404-x86_64.list \
     && echo 'deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://nvidia.github.io/libnvidia-container/stable/deb/amd64 /' \
        > /image/mkosi.skeleton/etc/apt/sources.list.d/nvidia-container-toolkit.list \
+    && echo 'deb [signed-by=/usr/share/keyrings/nvidia-doca-keyring.gpg] https://linux.mellanox.com/public/repo/doca/latest/ubuntu24.04/x86_64/ ./' \
+       > /image/mkosi.skeleton/etc/apt/sources.list.d/nvidia-doca-ubuntu2404.list \
     && printf '%s\n' \
        'Package: nvidia-*' \
        'Pin: origin developer.download.nvidia.com' \
@@ -74,7 +97,24 @@ RUN mkdir -p /image/mkosi.skeleton/etc/apt/sources.list.d \
        'Package: cuda-*' \
        'Pin: origin developer.download.nvidia.com' \
        'Pin-Priority: 1001' \
-       > /image/mkosi.skeleton/etc/apt/preferences.d/nvidia-cuda-repo
+       > /image/mkosi.skeleton/etc/apt/preferences.d/nvidia-cuda-repo \
+    && printf '%s\n' \
+       '# Pin the NVIDIA DOCA-Host repo to priority 100. Apt rule for' \
+       '# "Pin-Priority: 100": install ONLY if explicitly requested or as' \
+       '# a dep, never auto-upgrade or replace a package available from' \
+       '# a higher-priority source. Outcome:' \
+       '#   - ucx (only in DOCA)        -> installs from DOCA  ✓ (resolves' \
+       '#                                  collectx-bringup Depends: ucx)' \
+       '#   - rdma-core, ibverbs-utils, libibumad3 (in universe @500 AND' \
+       '#     in DOCA @100)             -> installs from universe ✓' \
+       '#                                  (keeps inbox OFED, not MOFED)' \
+       '#   - collectx-bringup, mft*    -> install from cuda repo (origin' \
+       '#                                  developer.download.nvidia.com,' \
+       '#                                  default priority 500 > 100)' \
+       'Package: *' \
+       'Pin: origin linux.mellanox.com' \
+       'Pin-Priority: 100' \
+       > /image/mkosi.skeleton/etc/apt/preferences.d/nvidia-doca-repo
 
 RUN --security=insecure mkosi --profile=$PROFILE --image-version=$IMAGE_VERSION
 

diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.conf.d/ubuntu.conf b/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.conf.d/ubuntu.conf
@@ -25,11 +25,130 @@ Packages=
     iptables
     e2fsprogs
     cryptsetup
-    nvidia-driver-580-open=580.126.20-1ubuntu1
-    nvidia-persistenced=580.126.20-1ubuntu1
-    nvidia-fabricmanager=580.126.20-1
-    libnvidia-nscq=580.126.20-1
+    nvidia-driver-open=595.71.05-1ubuntu1
+    nvidia-persistenced=595.71.05-1ubuntu1
+    nvidia-fabricmanager=595.71.05-1ubuntu1
+    libnvidia-nscq=595.71.05-1ubuntu1
     nvidia-container-toolkit=1.19.0-1
+    nvidia-fabricmanager-dev=595.71.05-1ubuntu1
+    nvlsm
+    # nvlink5-595 metapackage components — installed individually because
+    # the metapackage is deprecated. Per the NVIDIA FM User Guide
+    # §"Installing Fabric Manager / Systems Using Fourth Generation
+    # NVSwitches", the canonical B200/B300 install path used to be:
+    #   sudo apt-get install -V nvidia-open-<branch>
+    #   sudo apt-get install -V nvlink5-<branch>
+    # The nvlink5 metapackage has no files of its own — it just pulls in
+    # a fixed list of components (see
+    # https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/nvlink5-migration.html).
+    # NVIDIA's documented migration is to install those components
+    # explicitly; the dep list of nvlink5_595.71.05-1 is:
+    #   libnvidia-nscq (>=595)                              — pinned above
+    #   libnvsdm (>=595)                                    — added below
+    #   nvidia-fabricmanager (>=595)                        — pinned above
+    #   nvidia-imex (>=595)                                 — added below
+    #   nvidia-dkms-open / nvidia-kernel-open-dkms (>=595)  — via nvidia-driver-open
+    #   libnvidia-compute / nvidia-driver-cuda (>=595)      — via nvidia-driver-open
+    #   nvlsm (>=2025.10.12)                                — listed above (unpinned)
+    #   collectx-bringup, mft, mft-oem, mft-autocomplete    — INTENTIONALLY OMITTED, see note below
+    #
+    # Why each one matters (FM Guide §Shared NVSwitch Virtualization Model
+    # and §1259 NVSwitch Errors On DGX B200/B300):
+    #   libnvsdm     — NVSwitch Device Manager telemetry library; replaces
+    #                  the SXID error path on B200/B300. DCGM (added
+    #                  below) reads NVSwitch port/ASIC counters through
+    #                  it; without it NVSwitch faults are invisible.
+    #   nvidia-imex  — Internode Memory Exchange daemon. Brokers secure
+    #                  cross-OS-instance shared CUDA memory channels over
+    #                  NVLink. Required for multi-tenant Shared-NVSwitch
+    #                  workloads where NCCL group formation registers
+    #                  memory regions that cross partition boundaries;
+    #                  without IMEX the GPU's secure subsystem fires
+    #                  Xid 170 SECURE Fatal CROSS_CONTAIN. See fortress
+    #                  scratch/oci-b200/.../docs/KNOWN-ISSUES.md §1
+    #                  "mode C" for the symptom this targets.
+    libnvsdm=595.71.05-1ubuntu1
+    nvidia-imex=595.71.05-1ubuntu1
+    # CX7 / NVSwitch firmware-management and telemetry tools. These come
+    # from NVIDIA's CUDA repo, but their `ucx` dependency comes from
+    # NVIDIA's DOCA-Host networking repo. Both repos are wired into the
+    # mkosi.skeleton apt sources by Dockerfile.mkosi.ubuntu (see comments
+    # there for the priority-100 fallback pin that makes this resolve
+    # without dragging in MOFED userspace).
+    #   collectx-bringup  CX7 bringup utilities for the NVLink-management
+    #                     bridge. Depends on ucx (now installable via
+    #                     DOCA repo). Used by FM/NVLSM internal startup
+    #                     scripts to query CX7 SMDL/VPD info.
+    #   mft / mft-oem /   Mellanox Firmware Tools (mst, flint, mlxconfig,
+    #   mft-autocomplete  mlxlink). Required for B200 LPF firmware
+    #                     diagnostics when the CX7 bridge fails to come
+    #                     up. nvlsm's prelaunch script uses mst to enum
+    #                     management ports.
+    collectx-bringup
+    mft
+    mft-oem
+    mft-autocomplete
+    # Userspace OFED bits that the FM User Guide "NVIDIA Software Packages"
+    # section requires on B200/B300 ("OFED or MOFED package is required").
+    # rdma-core pulls in libibverbs1, librdmacm1, libibmad5, libibumad3,
+    # libibnetdisc5, etc. ibverbs-utils gives ibv_devices/ibv_devinfo for
+    # triage. infiniband-diags below already provides ibstatus/ibstat.
+    rdma-core
+    ibverbs-utils
+    infiniband-diags
+    # List libibumad3 explicitly (no version pin). FM User Guide §"Other
+    # NVIDIA Software Packages" calls out libibumad3 by name as a B200/B300
+    # SVM requirement. Today this lands transitively via infiniband-diags,
+    # but naming it here keeps the package set stable against any future
+    # transitive-dep churn and lets verify-svc-vm.sh's
+    # /lib/x86_64-linux-gnu/libibumad.so.3 gate be deterministic.
+    libibumad3
+    # Pin nvidia-utils-595 + nvidia-modprobe explicitly. Today these come
+    # transitively via nvidia-driver-open=595.71.05-1ubuntu1, but:
+    #   nvidia-utils-595      provides /usr/bin/nvidia-smi, which
+    #                         verify-svc-vm.sh and several stage-c-*
+    #                         systemd ExecStartPre hooks invoke directly.
+    #                         Without it the SVM would have a driver
+    #                         loaded but no userspace tools to query it.
+    #   nvidia-modprobe       SUID helper that auto-creates /dev/nvidia*
+    #                         device nodes for non-root NVML callers.
+    #                         Required for nvidia-imex, DCGM, and any
+    #                         tenant-side enumeration that doesn't run
+    #                         as root.
+    nvidia-utils-595=595.71.05-1ubuntu1
+    nvidia-modprobe=595.71.05-1ubuntu1
+    # Data Center GPU Manager (DCGM) v4. Per FM User Guide §"NVSwitch
+    # Errors On DGX B200/B300 and NVIDIA HGX B200/B300 Systems":
+    #   "NVSwitch SXID errors are no longer applicable to DGX B200/B300
+    #    and NVIDIA HGX B200/B300 systems. DCGM now interfaces with a
+    #    library called NVIDIA Switch Device Manager (NVSDM) to fetch
+    #    errors related to NVSwitch."
+    # libnvsdm above gives us the library; DCGM is the consumer that
+    # surfaces those errors as queryable health/telemetry. Without DCGM,
+    # NVSwitch error visibility on B200 stops at FM/NVLSM syslog lines.
+    # We install the -cuda12 variant even though R595 pairs with the CUDA 13
+    # toolkit: DCGM-cuda12 is forward-compatible per NVIDIA's matrix.
+    datacenter-gpu-manager-4-cuda12
+    datacenter-gpu-manager-4-config
+    # lshw provides the `lshw` hardware lister. Per FM User Guide §"Additional
+    # Steps for NVIDIA HGX B200/B300 Systems": "Query the VPD information
+    # using the lspci -vvs or vpddecode command and identify the four PF
+    # functions you want." Used to distinguish CX7 NVLink-management
+    # bridge LPFs from CX7 NICs by their SMDL=SW_MNG VPD field. Today we
+    # rely on `lspci -vvs` (pciutils, listed once below).
+    # NOTE(review): on Ubuntu `vpddecode` is packaged in `dmidecode`, not
+    # `lshw` — add `dmidecode` here if `vpddecode` is wanted for ad-hoc
+    # BDF-discovery triage, and confirm against the package file lists.
+    lshw
+    docker.io
+    python3-minimal
+    python3-pip
+    curl
+    iputils-ping
+    pciutils
+    libcurl4t64
+    libxml2
+    libxmlsec1-openssl
 
 RemoveFiles=/etc/issue
 RemoveFiles=/etc/issue.net

diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.repart-debug/10-root.conf b/src/cloud-api-adaptor/podvm-mkosi/mkosi.presets/system/mkosi.repart-debug/10-root.conf
@@ -2,4 +2,12 @@
 Type=root
 Format=ext4
 CopyFiles=/
-Minimize=guess
+# `Minimize=guess` chronically under-sizes the debug rootfs once the
+# NVIDIA 595 stack (driver-open + fabricmanager + nscq + persistenced +
+# nv-attestation-sdk) lands in /usr — the build either fails with
+# `no space left on device` mid-mkosi or produces a qcow2 whose
+# rootfs runs out of space at first boot. Pin to a fixed 12 GiB so
+# the debug variant has headroom for the full B200 CC userspace.
+Minimize=off
+SizeMinBytes=12G
+SizeMaxBytes=12G
diff --git a/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/etc/modules-load.d/nvlink-fabric.conf b/src/cloud-api-adaptor/podvm-mkosi/mkosi.skeleton/etc/modules-load.d/nvlink-fabric.conf
@@ -0,0 +1,3 @@
+# Required by nv-fabricmanager NVL5+ subnet management path
+# (nvidia-fabricmanager-start.sh checks lsmod for ib_umad and exits if missing)
+ib_umad
diff --git a/...m-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-persistenced.service.d/override.conf b/...m-mkosi/mkosi.skeleton/usr/lib/systemd/system/nvidia-persistenced.service.d/override.conf
@@ -3,6 +3,10 @@ After=nvidia-fabricmanager.service
 
 [Service]
 ExecCondition=/usr/local/bin/check-nvidia-gpu
+# Block daemon startup until every visible GPU has fabric.state=Completed.
+# See /usr/local/bin/wait-nvlink-fabric.sh for the rationale (B200 CC race).
+ExecStartPre=/usr/local/bin/wait-nvlink-fabric.sh
+
 ExecStart=
 ExecStart=/usr/bin/nvidia-persistenced --user nvidia-persistenced --uvm-persistence-mode --verbose
 ExecStartPost=/usr/bin/nvidia-smi conf-compute -srs 1