From 2d808338c5d379a4b5e4bdbf9a653e3c73f2e8b0 Mon Sep 17 00:00:00 2001
From: Alhassan Khedr <alhassan.khedr@cohere.com>
Date: Thu, 14 May 2026 14:20:39 -0400
Subject: [PATCH 1/8] ci: build NVIDIA GPU confidential Kata UVM image from
 source
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a workflow that builds the kata-containers
nvidia-gpu-confidential UVM image with our cohere-fork
guest-components (attestation-agent + api-server-rest) baked in
*at compile time*, instead of post-hoc patching the stock NVIDIA
image with `losetup` + `veritysetup format` (which is what
fortress/scratch/oci-b200/k8s/06-patch-uvm.sh has been doing).

Mechanics:
  1. Check out kata-containers @ inputs.kata_ref (default 3.30.0).
  2. Rewrite versions.yaml: point externals.coco-guest-components.url
     and .version at cohere-ai/guest-components @ <gc_ref> (resolved
     to a SHA via git ls-remote so the build is reproducible).
  3. `make rootfs-image-nvidia-gpu-confidential-tarball` — kata's
     existing build infrastructure clones our fork into the
     coco-guest-components builder container, statically builds AA +
     api-server-rest + CDH, and nvidia_rootfs.sh::coco_guest_components()
     copies them into the rootfs at /usr/local/bin/. From there the
     standard rootfs assembly + dm-verity formatting runs unchanged.
  4. Extract the .image and root_hash file from the tarball, surface
     dm-verity params (root_hash, salt, data_blocks, block sizes) and
     the image sha256 as a measurements.json layer.
  5. zstd -19 the .image, push to GHCR via oras as a 3-layer artifact
     with annotations covering build provenance + verity params.
  6. SLSA build provenance attestation.

Output: ghcr.io/cohere-ai/cloud-api-adaptor/kata-uvm-nvidia-gpu-confidential:<tag>
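where <tag> is `cohere-latest` for branch pushes,
`kata-${kata_ref}-gc-${gc_ref}` for workflow_dispatch (slashes replaced
with `-`, gc_ref truncated to 12 characters), or the pushed tag with its
`kata-uvm-` prefix stripped for `kata-uvm-v*` tag pushes (e.g.
`kata-uvm-v1.0.0` -> `v1.0.0`). A non-empty `tag_suffix` input is
appended as `-<suffix>`.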

Companion host-side install script lives at
fortress/scratch/oci-b200/k8s/08-install-uvm.sh: it pulls this artifact,
verifies sha256 against measurements.json, and rewrites
kernel_verity_params in the kata config from the manifest. No host
veritysetup needed.

NOTE: this commit also temporarily adds
`alhassankhedr/build-kata-uvm-cohere` to `on.push.branches` so we can
validate end-to-end on the PR branch before merge. That entry must be
removed before this lands on cohere.
---
 .github/workflows/build-kata-uvm-cohere.yaml | 382 +++++++++++++++++++
 1 file changed, 382 insertions(+)
 create mode 100644 .github/workflows/build-kata-uvm-cohere.yaml
diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml
new file mode 100644
index 0000000000..3f7008ade2
--- /dev/null
+++ b/.github/workflows/build-kata-uvm-cohere.yaml
@@ -0,0 +1,382 @@
+name: Build Kata UVM Image (Cohere NVIDIA GPU Confidential)
+
+# Build the Kata Containers NVIDIA-GPU-confidential UVM image with our
+# attestation-agent + api-server-rest baked in *from source*, instead of
+# post-hoc patching the stock NVIDIA image (which is what
+# fortress/scratch/oci-b200/k8s/06-patch-uvm.sh does).
+#
+# How:
+#   1. Check out kata-containers @ ${kata_ref}.
+#   2. Rewrite versions.yaml: point externals.coco-guest-components.url /
+#      .version at our cohere-ai/guest-components fork. The kata build
+#      driver clones that and statically builds AA + api-server-rest +
+#      CDH. nvidia_rootfs.sh's coco_guest_components() step then copies
+#      those binaries into the final UVM rootfs at /usr/local/bin/.
+#   3. Run `make rootfs-image-nvidia-gpu-confidential-tarball` (which also
+#      builds agent, busybox, pause-image, coco-guest-components, and
+#      kernel-nvidia-gpu under the hood — every dep is containerised by
+#      kata-deploy-binaries-in-docker.sh, so the runner just needs Docker).
+#   4. Extract the .image + root_hash file from the tarball.
+#   5. Push to GHCR as an OCI artifact with the dm-verity params surfaced
+#      as annotations so the host install script can wire kata config
+#      without re-running `veritysetup format`.
+#
+# Output OCI ref:
+#   ghcr.io/${{ github.repository }}/kata-uvm-nvidia-gpu-confidential:<tag>
+#
+# Companion install script (consumes this artifact on a B200 host):
+#   fortress/scratch/oci-b200/k8s/08-install-uvm.sh
+
+on:
+  push:
+    tags: ["kata-uvm-v*"]
+    branches:
+      - "cohere"
+      # TEMPORARY: enable end-to-end validation of the workflow on the
+      # feature branch before merge. Remove this entry as part of the
+      # final review; only `cohere` should remain.
+      - "alhassankhedr/build-kata-uvm-cohere"
+    paths:
+      - ".github/workflows/build-kata-uvm-cohere.yaml"
+  workflow_dispatch:
+    inputs:
+      kata_ref:
+        description: "kata-containers ref to build from (tag, branch, or SHA)"
+        required: false
+        type: string
+        default: "3.30.0"
+      kata_repo:
+        description: "kata-containers repo URL"
+        required: false
+        type: string
+        default: "https://github.com/kata-containers/kata-containers.git"
+      gc_repo:
+        description: "guest-components repo URL"
+        required: false
+        type: string
+        default: "https://github.com/cohere-ai/guest-components.git"
+      gc_ref:
+        description: "guest-components ref (branch, tag, or SHA)"
+        required: false
+        type: string
+        default: "cohere"
+      nvidia_gpu_stack:
+        description: "NVIDIA GPU stack components (driver= is added from versions.yaml)"
+        required: false
+        type: string
+        default: "compute,dcgm,nvswitch"
+      tag_suffix:
+        description: "Optional suffix appended to the OCI tag (e.g. for ad-hoc test builds)"
+        required: false
+        type: string
+        default: ""
+
+permissions:
+  id-token: write
+  attestations: write
+  contents: read
+  packages: write
+
+env:
+  OCI_IMAGE: ghcr.io/${{ github.repository }}/kata-uvm-nvidia-gpu-confidential
+
+jobs:
+  meta:
+    name: Compute metadata
+    runs-on: ubuntu-latest
+    outputs:
+      tag: ${{ steps.compute.outputs.tag }}
+      kata_ref: ${{ steps.compute.outputs.kata_ref }}
+      gc_repo: ${{ steps.compute.outputs.gc_repo }}
+      gc_ref: ${{ steps.compute.outputs.gc_ref }}
+      nvidia_gpu_stack: ${{ steps.compute.outputs.nvidia_gpu_stack }}
+    steps:
+      - name: Compute tag and inputs
+        id: compute
+        env:
+          KATA_REF: ${{ inputs.kata_ref || '3.30.0' }}
+          GC_REPO:  ${{ inputs.gc_repo  || 'https://github.com/cohere-ai/guest-components.git' }}
+          GC_REF:   ${{ inputs.gc_ref   || 'cohere' }}
+          STACK:    ${{ inputs.nvidia_gpu_stack || 'compute,dcgm,nvswitch' }}
+          SUFFIX:   ${{ inputs.tag_suffix || '' }}
+        run: |
+          # Tag pattern:
+          #   kata-uvm-v* push  -> use the tag literal (after stripping `kata-uvm-`)
+          #   workflow_dispatch -> kata-${KATA_REF}-gc-${GC_REF_SHORT}[suffix]
+          #   branch push       -> cohere-latest
+          if [[ "$GITHUB_REF" == refs/tags/kata-uvm-v* ]]; then
+            TAG="${GITHUB_REF#refs/tags/kata-uvm-}"
+          elif [[ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]]; then
+            GC_SHORT="${GC_REF//\//-}"
+            GC_SHORT="${GC_SHORT:0:12}"
+            TAG="kata-${KATA_REF//\//-}-gc-${GC_SHORT}"
+          else
+            TAG="cohere-latest"
+          fi
+          [ -n "$SUFFIX" ] && TAG="${TAG}-${SUFFIX}"
+          # OCI tags can't have '+' or unbounded length; sanitize.
+          TAG="${TAG//+/-}"
+          {
+            echo "tag=$TAG"
+            echo "kata_ref=$KATA_REF"
+            echo "gc_repo=$GC_REPO"
+            echo "gc_ref=$GC_REF"
+            echo "nvidia_gpu_stack=$STACK"
+          } >> "$GITHUB_OUTPUT"
+
+  build:
+    name: Build kata UVM (nvidia-gpu-confidential)
+    needs: meta
+    runs-on: ubuntu-latest
+    timeout-minutes: 180
+    steps:
+      - name: Free up runner disk space
+        # The kata build pulls a CUDA repo + NVIDIA drivers into a chroot
+        # and a kernel build alongside. Default ubuntu-latest leaves ~14G;
+        # we need ~40G or the rootfs build OOMs the disk.
+        run: |
+          set -eux
+          df -h /
+          sudo rm -rf /usr/local/lib/android /usr/share/dotnet /opt/ghc \
+                      /usr/local/share/boost /opt/hostedtoolcache/CodeQL \
+                      /usr/local/share/powershell /usr/local/share/chromium
+          sudo apt-get purge -y google-cloud-cli azure-cli microsoft-edge-stable \
+                                dotnet-* aspnetcore-* mongodb-* mysql-* 2>/dev/null || true
+          sudo apt-get autoremove -y
+          sudo apt-get clean
+          docker system prune -af --volumes 2>/dev/null || true
+          df -h /
+
+      - name: Install host build dependencies
+        run: |
+          sudo apt-get update -qq
+          sudo apt-get install -y --no-install-recommends \
+            git make curl ca-certificates jq python3 python3-pip
+          # Ensure yq is present (kata's build scripts rely on it).
+          if ! command -v yq >/dev/null 2>&1; then
+            sudo curl -fsSL -o /usr/local/bin/yq \
+              https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64
+            sudo chmod +x /usr/local/bin/yq
+          fi
+          yq --version
+
+      - name: Install ORAS
+        # Pin via fortress/CAA convention: read from caa's versions.yaml so
+        # we stay in lockstep. Fallback to a known-good version if the file
+        # is unavailable for some reason.
+        run: |
+          ORAS_VERSION=1.2.0
+          curl -fsSLO "https://github.com/oras-project/oras/releases/download/v${ORAS_VERSION}/oras_${ORAS_VERSION}_linux_amd64.tar.gz"
+          tar -xzf "oras_${ORAS_VERSION}_linux_amd64.tar.gz" oras
+          sudo mv oras /usr/local/bin/
+          rm -f "oras_${ORAS_VERSION}_linux_amd64.tar.gz"
+          oras version
+
+      - name: Checkout kata-containers @ ${{ needs.meta.outputs.kata_ref }}
+        run: |
+          set -eux
+          git clone --depth 1 --branch "${{ needs.meta.outputs.kata_ref }}" \
+            "${{ inputs.kata_repo || 'https://github.com/kata-containers/kata-containers.git' }}" \
+            /tmp/kata
+          ( cd /tmp/kata && git rev-parse HEAD )
+
+      - name: Override coco-guest-components in versions.yaml
+        # This is the key step: tell kata's coco-guest-components builder
+        # to clone our cohere-ai fork at our chosen ref. Everything
+        # downstream (rootfs assembly, dm-verity, root_hash) is unchanged
+        # and uses these binaries as if they had come from upstream.
+        env:
+          GC_REPO: ${{ needs.meta.outputs.gc_repo }}
+          GC_REF:  ${{ needs.meta.outputs.gc_ref }}
+        run: |
+          set -eux
+          cd /tmp/kata
+          # Resolve gc_ref to a SHA so the build is reproducible. We do
+          # this with `git ls-remote` rather than cloning the whole tree.
+          GC_SHA=$(git ls-remote "${GC_REPO}" "${GC_REF}" | awk '{print $1}' | head -n1)
+          if [[ -z "$GC_SHA" ]]; then
+            # Maybe gc_ref already IS a SHA; let downstream fail loudly if not.
+            GC_SHA="${GC_REF}"
+          fi
+          echo "Resolved guest-components ref ${GC_REF} -> ${GC_SHA}"
+
+          yq -i \
+            ".externals.\"coco-guest-components\".url = \"${GC_REPO}\" |
+             .externals.\"coco-guest-components\".version = \"${GC_SHA}\"" \
+            versions.yaml
+
+          echo "----- updated versions.yaml (coco-guest-components) -----"
+          yq '.externals."coco-guest-components"' versions.yaml
+
+      - name: Build rootfs-image-nvidia-gpu-confidential
+        env:
+          NVIDIA_GPU_STACK: ${{ needs.meta.outputs.nvidia_gpu_stack }}
+        run: |
+          set -eux
+          cd /tmp/kata/tools/packaging/kata-deploy/local-build
+          # `make <variant>-tarball` chains all the Docker-isolated builds
+          # (agent, busybox, pause-image, coco-guest-components,
+          # kernel-nvidia-gpu) before running the rootfs assembly. Each
+          # sub-build runs in its own ephemeral container, so we don't
+          # need to install rust/go/etc on the host.
+          NVIDIA_GPU_STACK="$NVIDIA_GPU_STACK" \
+            make rootfs-image-nvidia-gpu-confidential-tarball
+
+          ls -lh build/
+
+      - name: Extract .image and root_hash from the tarball
+        run: |
+          set -eux
+          cd /tmp/kata/tools/packaging/kata-deploy/local-build/build
+          TARBALL=kata-static-rootfs-image-nvidia-gpu-confidential.tar.zst
+          [[ -f "$TARBALL" ]] || { echo "FATAL: $TARBALL missing"; exit 1; }
+
+          mkdir -p /tmp/uvm-out
+          # Tarball layout:
+          #   ./opt/kata/share/kata-containers/kata-containers-nvidia-gpu-confidential.img
+          #   ./opt/kata/share/kata-containers/root_hash_nvidia-gpu-confidential.txt
+          tar --zstd -xvf "$TARBALL" -C /tmp/uvm-out
+          mv /tmp/uvm-out/opt/kata/share/kata-containers/kata-containers-nvidia-gpu-confidential.img \
+             /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img
+          mv /tmp/uvm-out/opt/kata/share/kata-containers/root_hash_nvidia-gpu-confidential.txt \
+             /tmp/uvm-out/root_hash.txt
+          rm -rf /tmp/uvm-out/opt
+          ls -lh /tmp/uvm-out/
+          echo "----- root_hash.txt -----"
+          cat /tmp/uvm-out/root_hash.txt
+
+      - name: Surface verity params as JSON metadata
+        id: measure
+        # The root_hash.txt file is the source of truth for kata's
+        # `kernel_verity_params` (root_hash, salt, data_blocks, etc).
+        # We re-emit those values as a flat JSON file so the host install
+        # script can parse them without invoking veritysetup.
+        run: |
+          set -eux
+          ROOT_HASH=$(awk -F'=' '/^root_hash=/ {print $2}' /tmp/uvm-out/root_hash.txt)
+          SALT=$(awk -F'=' '/^salt=/ {print $2}' /tmp/uvm-out/root_hash.txt)
+          DATA_BLOCKS=$(awk -F'=' '/^data_blocks=/ {print $2}' /tmp/uvm-out/root_hash.txt)
+          DATA_BLOCK_SIZE=$(awk -F'=' '/^data_block_size=/ {print $2}' /tmp/uvm-out/root_hash.txt)
+          HASH_BLOCK_SIZE=$(awk -F'=' '/^hash_block_size=/ {print $2}' /tmp/uvm-out/root_hash.txt)
+          IMG_SHA256=$(sha256sum /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img | awk '{print $1}')
+          IMG_BYTES=$(stat -c %s /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img)
+
+          jq -n \
+            --arg kata_ref       "${{ needs.meta.outputs.kata_ref }}" \
+            --arg gc_repo        "${{ needs.meta.outputs.gc_repo }}" \
+            --arg gc_ref         "${{ needs.meta.outputs.gc_ref }}" \
+            --arg nvidia_stack   "${{ needs.meta.outputs.nvidia_gpu_stack }}" \
+            --arg root_hash      "$ROOT_HASH" \
+            --arg salt           "$SALT" \
+            --arg data_blocks    "$DATA_BLOCKS" \
+            --arg data_block_sz  "$DATA_BLOCK_SIZE" \
+            --arg hash_block_sz  "$HASH_BLOCK_SIZE" \
+            --arg img_sha256     "$IMG_SHA256" \
+            --arg img_bytes      "$IMG_BYTES" \
+            --arg caa_commit     "$GITHUB_SHA" \
+            --arg build_date     "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+            '{
+              kata_ref: $kata_ref,
+              guest_components: {repo: $gc_repo, ref: $gc_ref},
+              nvidia_gpu_stack: $nvidia_stack,
+              dm_verity: {
+                root_hash:       $root_hash,
+                salt:            $salt,
+                data_blocks:     ($data_blocks    | tonumber),
+                data_block_size: ($data_block_sz  | tonumber),
+                hash_block_size: ($hash_block_sz  | tonumber)
+              },
+              image: {
+                filename: "kata-containers-nvidia-gpu-confidential.img",
+                sha256:   $img_sha256,
+                bytes:    ($img_bytes | tonumber)
+              },
+              source: {caa_commit: $caa_commit, build_date: $build_date}
+            }' > /tmp/uvm-out/measurements.json
+
+          cat /tmp/uvm-out/measurements.json
+          echo "root_hash=$ROOT_HASH" >> "$GITHUB_OUTPUT"
+          echo "img_sha256=$IMG_SHA256" >> "$GITHUB_OUTPUT"
+
+      - name: Compress .image for transport
+        run: |
+          set -eux
+          cd /tmp/uvm-out
+          # The raw .image is ~250 MiB; zstd brings it under 100 MiB which
+          # makes oras push fast on cold registries.
+          zstd -19 --long -T0 --rm kata-containers-nvidia-gpu-confidential.img \
+                                   -o kata-containers-nvidia-gpu-confidential.img.zst
+          ls -lh
+
+      - name: Login to GHCR
+        uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Push artifact to GHCR
+        id: push
+        env:
+          OCI_TAG:    ${{ needs.meta.outputs.tag }}
+          ROOT_HASH:  ${{ steps.measure.outputs.root_hash }}
+          IMG_SHA256: ${{ steps.measure.outputs.img_sha256 }}
+        run: |
+          set -eux
+          OCI_REF="${OCI_IMAGE}:${OCI_TAG}"
+          cd /tmp/uvm-out
+          oras push "$OCI_REF" \
+            kata-containers-nvidia-gpu-confidential.img.zst:application/vnd.cohere.kata-uvm.image+zstd \
+            root_hash.txt:application/vnd.cohere.kata-uvm.verity+plain \
+            measurements.json:application/vnd.cohere.kata-uvm.measurements+json \
+            --annotation "org.opencontainers.image.title=kata-uvm-nvidia-gpu-confidential" \
+            --annotation "org.opencontainers.image.description=Kata Containers NVIDIA GPU confidential UVM image, built from source with cohere-ai/guest-components" \
+            --annotation "org.opencontainers.image.source=https://github.com/${GITHUB_REPOSITORY}" \
+            --annotation "org.opencontainers.image.revision=${GITHUB_SHA}" \
+            --annotation "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
+            --annotation "com.cohere.caa.commit=${GITHUB_SHA}" \
+            --annotation "com.cohere.kata.ref=${{ needs.meta.outputs.kata_ref }}" \
+            --annotation "com.cohere.guest-components.repo=${{ needs.meta.outputs.gc_repo }}" \
+            --annotation "com.cohere.guest-components.ref=${{ needs.meta.outputs.gc_ref }}" \
+            --annotation "com.cohere.kata-uvm.image-sha256=${IMG_SHA256}" \
+            --annotation "com.cohere.kata-uvm.root-hash=${ROOT_HASH}" \
+            --format json > oras-output.json
+
+          cat oras-output.json
+          DIGEST=$(jq -r '.digest' oras-output.json)
+          {
+            echo "digest=$DIGEST"
+            echo "oci_ref=${OCI_REF}@${DIGEST}"
+            echo "oci_tag=$OCI_TAG"
+          } >> "$GITHUB_OUTPUT"
+          echo "Pushed: $OCI_REF @ $DIGEST"
+
+      - name: Attest build provenance
+        uses: actions/attest-build-provenance@a2bbfa25375fe432b6a289bc6b6cd05ecd0c4c32 # v4
+        with:
+          subject-name: ${{ env.OCI_IMAGE }}
+          subject-digest: ${{ steps.push.outputs.digest }}
+          push-to-registry: true
+
+      - name: Job summary
+        run: |
+          {
+            echo "### Kata UVM image built"
+            echo ""
+            echo "| Field | Value |"
+            echo "| --- | --- |"
+            echo "| OCI ref | \`${OCI_IMAGE}:${{ needs.meta.outputs.tag }}\` |"
+            echo "| Digest | \`${{ steps.push.outputs.digest }}\` |"
+            echo "| kata-containers ref | \`${{ needs.meta.outputs.kata_ref }}\` |"
+            echo "| guest-components | \`${{ needs.meta.outputs.gc_repo }}@${{ needs.meta.outputs.gc_ref }}\` |"
+            echo "| NVIDIA stack | \`${{ needs.meta.outputs.nvidia_gpu_stack }}\` |"
+            echo "| root_hash | \`${{ steps.measure.outputs.root_hash }}\` |"
+            echo "| image sha256 | \`${{ steps.measure.outputs.img_sha256 }}\` |"
+            echo ""
+            echo "Install on a B200 host with:"
+            echo ""
+            echo '```bash'
+            echo "ORAS_REF=${OCI_IMAGE}:${{ needs.meta.outputs.tag }} \\"
+            echo "  bash fortress/scratch/oci-b200/k8s/08-install-uvm.sh"
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"

From d309d64c5d72bd2ef68814f6f20dbbab2190f4ac Mon Sep 17 00:00:00 2001
From: Alhassan Khedr <alhassan.khedr@cohere.com>
Date: Thu, 14 May 2026 14:50:58 -0400
Subject: [PATCH 2/8] ci: update install-script references to fortress
 k8s/05-install-uvm.sh

Companion to fortress's k8s/ script reordering. The CI workflow's
header comments and the job summary now point at the renumbered
install script (05-install-uvm.sh, previously 08-install-uvm.sh), and
the header comment refers to the legacy patch script by its new number
(08-patch-uvm.sh, previously 06-patch-uvm.sh).
---
 .github/workflows/build-kata-uvm-cohere.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml
index 3f7008ade2..ae2f857efb 100644
--- a/.github/workflows/build-kata-uvm-cohere.yaml
+++ b/.github/workflows/build-kata-uvm-cohere.yaml
@@ -2,8 +2,8 @@ name: Build Kata UVM Image (Cohere NVIDIA GPU Confidential)
 
 # Build the Kata Containers NVIDIA-GPU-confidential UVM image with our
 # attestation-agent + api-server-rest baked in *from source*, instead of
-# post-hoc patching the stock NVIDIA image (which is what
-# fortress/scratch/oci-b200/k8s/06-patch-uvm.sh does).
+# post-hoc patching the stock NVIDIA image (which is what the legacy
+# fortress/scratch/oci-b200/k8s/08-patch-uvm.sh does).
 #
 # How:
 #   1. Check out kata-containers @ ${kata_ref}.
@@ -25,7 +25,7 @@ name: Build Kata UVM Image (Cohere NVIDIA GPU Confidential)
 #   ghcr.io/${{ github.repository }}/kata-uvm-nvidia-gpu-confidential:<tag>
 #
 # Companion install script (consumes this artifact on a B200 host):
-#   fortress/scratch/oci-b200/k8s/08-install-uvm.sh
+#   fortress/scratch/oci-b200/k8s/05-install-uvm.sh
 
 on:
   push:
@@ -377,6 +377,6 @@ jobs:
             echo ""
             echo '```bash'
             echo "ORAS_REF=${OCI_IMAGE}:${{ needs.meta.outputs.tag }} \\"
-            echo "  bash fortress/scratch/oci-b200/k8s/08-install-uvm.sh"
+            echo "  bash fortress/scratch/oci-b200/k8s/05-install-uvm.sh"
             echo '```'
           } >> "$GITHUB_STEP_SUMMARY"

From fccd4d7b48966e3a55b0bf7e4a4c18486d54d644 Mon Sep 17 00:00:00 2001
From: Alhassan Khedr <alhassan.khedr@cohere.com>
Date: Thu, 14 May 2026 15:22:57 -0400
Subject: [PATCH 3/8] fix(ci): auto-pin driver= in NVIDIA_GPU_STACK from kata
 versions.yaml

kata 3.30+ nvidia_chroot.sh runs with set -u and only assigns
driver_version when NVIDIA_GPU_STACK contains a literal `driver=<ver>`
component. Without it the rootfs-assembly stage dies at the very last
step with `driver_version: unbound variable`, after the runner has
already done ~45 minutes of work (agent, busybox, pause-image,
coco-guest-components, kernel-nvidia-gpu).

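This is exactly how run 25877534335 failed. Fix: when NVIDIA_GPU_STACK
does not already contain a `driver=` component, derive the driver pin
from .externals.nvidia.driver.version in kata's own versions.yaml and
prepend driver=<ver> to NVIDIA_GPU_STACK in the build step. If the
version cannot be resolved, the step only prints a warning and the
stack is left unchanged. Auto-tracks kata_ref.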
---
 .github/workflows/build-kata-uvm-cohere.yaml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml
index ae2f857efb..0bdb91d910 100644
--- a/.github/workflows/build-kata-uvm-cohere.yaml
+++ b/.github/workflows/build-kata-uvm-cohere.yaml
@@ -214,6 +214,22 @@ jobs:
         run: |
           set -eux
           cd /tmp/kata/tools/packaging/kata-deploy/local-build
+          # kata 3.30+ nvidia_chroot.sh runs with `set -u` and only assigns
+          # driver_version when NVIDIA_GPU_STACK contains a literal
+          # `driver=<ver>` (tools/osbuilder/rootfs-builder/nvidia/
+          # nvidia_chroot.sh::install_userspace_components). Without it,
+          # the rootfs-assembly stage dies at the very end with
+          # `driver_version: unbound variable`. Derive the canonical pin
+          # from kata's own versions.yaml so this auto-tracks KATA_REF.
+          if [[ ",${NVIDIA_GPU_STACK}," != *",driver="* ]]; then
+            DRIVER_VER=$(yq '.externals.nvidia.driver.version' /tmp/kata/versions.yaml 2>/dev/null | tr -d '"')
+            if [[ -n "$DRIVER_VER" && "$DRIVER_VER" != "null" ]]; then
+              NVIDIA_GPU_STACK="driver=${DRIVER_VER},${NVIDIA_GPU_STACK}"
+              echo "Prepended driver=${DRIVER_VER} (from versions.yaml) -> ${NVIDIA_GPU_STACK}"
+            else
+              echo "WARN: could not resolve .externals.nvidia.driver.version from /tmp/kata/versions.yaml" >&2
+            fi
+          fi
           # `make <variant>-tarball` chains all the Docker-isolated builds
           # (agent, busybox, pause-image, coco-guest-components,
           # kernel-nvidia-gpu) before running the rootfs assembly. Each

From 55ea6da22b6682c49f0d7969ca2a9c5473eeceae Mon Sep 17 00:00:00 2001
From: Alhassan Khedr <alhassan.khedr@cohere.com>
Date: Thu, 14 May 2026 23:08:44 -0400
Subject: [PATCH 4/8] fix(ci): correctly parse root_hash.txt and dereference
 image symlink

Two bugs in the "Extract" / "Surface verity params" steps that together
caused the workflow to abort with `jq: error ... Expected JSON value
(while parsing '')` (exit 5) and would also have produced a junk
artifact even if jq had not failed:

1. root_hash.txt is a SINGLE comma-separated line written by kata's
   osbuilder, not five newline-separated key=value lines. The previous
   `awk -F'=' '/^salt=/ {print $2}'` parsers therefore returned empty
   strings for everything except root_hash (and even that came out with
   a trailing ",salt"), which crashed jq's `tonumber` on data_blocks.
   Replace with a single comma-split + case dispatch, plus regex
   sanity checks so a future format change fails loudly.

2. The .img inside the tarball is a symlink to the versioned .image
   alongside it. The previous `mv` only relocated the symlink, then
   `rm -rf opt/` deleted the underlying file. Resolve via `readlink -f`
   and `cp` the real file before tearing the directory down. Add a
   minimum-size assertion (>100 MiB) so a dangling symlink is caught
   immediately rather than producing measurements.json with bytes=57.

Also tightens the shell with `set -euxo pipefail` and a `jq -e .`
validation of the produced measurements.json.
---
 .github/workflows/build-kata-uvm-cohere.yaml | 58 ++++++++++++++++----
 1 file changed, 48 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml
index 0bdb91d910..1da3a8329d 100644
--- a/.github/workflows/build-kata-uvm-cohere.yaml
+++ b/.github/workflows/build-kata-uvm-cohere.yaml
@@ -242,24 +242,34 @@ jobs:
 
       - name: Extract .image and root_hash from the tarball
         run: |
-          set -eux
+          set -euxo pipefail
           cd /tmp/kata/tools/packaging/kata-deploy/local-build/build
           TARBALL=kata-static-rootfs-image-nvidia-gpu-confidential.tar.zst
           [[ -f "$TARBALL" ]] || { echo "FATAL: $TARBALL missing"; exit 1; }
 
           mkdir -p /tmp/uvm-out
-          # Tarball layout:
+          # Tarball layout (the .img entry is a symlink to the
+          # versioned .image alongside it):
           #   ./opt/kata/share/kata-containers/kata-containers-nvidia-gpu-confidential.img
+          #     -> kata-ubuntu-noble-nvidia-gpu-confidential-<ver>.image
+          #   ./opt/kata/share/kata-containers/kata-ubuntu-noble-nvidia-gpu-confidential-<ver>.image
           #   ./opt/kata/share/kata-containers/root_hash_nvidia-gpu-confidential.txt
           tar --zstd -xvf "$TARBALL" -C /tmp/uvm-out
-          mv /tmp/uvm-out/opt/kata/share/kata-containers/kata-containers-nvidia-gpu-confidential.img \
-             /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img
+
+          # Resolve the symlink to the real image and copy it (not move),
+          # so the underlying file survives the `rm -rf opt/` below.
+          KATA_IMG_LINK=/tmp/uvm-out/opt/kata/share/kata-containers/kata-containers-nvidia-gpu-confidential.img
+          KATA_IMG_REAL=$(readlink -f "$KATA_IMG_LINK")
+          [[ -f "$KATA_IMG_REAL" ]] || { echo "FATAL: $KATA_IMG_LINK -> $KATA_IMG_REAL missing"; exit 1; }
+          cp --reflink=auto "$KATA_IMG_REAL" /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img
+
           mv /tmp/uvm-out/opt/kata/share/kata-containers/root_hash_nvidia-gpu-confidential.txt \
              /tmp/uvm-out/root_hash.txt
           rm -rf /tmp/uvm-out/opt
           ls -lh /tmp/uvm-out/
           echo "----- root_hash.txt -----"
           cat /tmp/uvm-out/root_hash.txt
+          file /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img
 
       - name: Surface verity params as JSON metadata
         id: measure
@@ -267,15 +277,42 @@ jobs:
         # `kernel_verity_params` (root_hash, salt, data_blocks, etc).
         # We re-emit those values as a flat JSON file so the host install
         # script can parse them without invoking veritysetup.
+        #
+        # NOTE: kata's osbuilder writes root_hash.txt as a single
+        # comma-separated line, e.g.
+        #   root_hash=<hex>,salt=<hex>,data_blocks=N,data_block_size=4096,hash_block_size=4096
+        # so we split on commas first, then on '=' to populate each var.
         run: |
-          set -eux
-          ROOT_HASH=$(awk -F'=' '/^root_hash=/ {print $2}' /tmp/uvm-out/root_hash.txt)
-          SALT=$(awk -F'=' '/^salt=/ {print $2}' /tmp/uvm-out/root_hash.txt)
-          DATA_BLOCKS=$(awk -F'=' '/^data_blocks=/ {print $2}' /tmp/uvm-out/root_hash.txt)
-          DATA_BLOCK_SIZE=$(awk -F'=' '/^data_block_size=/ {print $2}' /tmp/uvm-out/root_hash.txt)
-          HASH_BLOCK_SIZE=$(awk -F'=' '/^hash_block_size=/ {print $2}' /tmp/uvm-out/root_hash.txt)
+          set -euxo pipefail
+
+          ROOT_HASH=""; SALT=""; DATA_BLOCKS=""
+          DATA_BLOCK_SIZE=""; HASH_BLOCK_SIZE=""
+          while IFS='=' read -r k v; do
+            case "$k" in
+              root_hash)       ROOT_HASH=$v ;;
+              salt)            SALT=$v ;;
+              data_blocks)     DATA_BLOCKS=$v ;;
+              data_block_size) DATA_BLOCK_SIZE=$v ;;
+              hash_block_size) HASH_BLOCK_SIZE=$v ;;
+            esac
+          done < <(tr ',' '\n' < /tmp/uvm-out/root_hash.txt)
+
+          # Fail loudly on parse regressions instead of producing a junk
+          # measurements.json with empty fields.
+          : "${ROOT_HASH:?root_hash missing from root_hash.txt}"
+          : "${SALT:?salt missing from root_hash.txt}"
+          : "${DATA_BLOCKS:?data_blocks missing from root_hash.txt}"
+          : "${DATA_BLOCK_SIZE:?data_block_size missing from root_hash.txt}"
+          : "${HASH_BLOCK_SIZE:?hash_block_size missing from root_hash.txt}"
+          [[ "$ROOT_HASH" =~ ^[0-9a-f]{64}$ ]] || { echo "bad root_hash: $ROOT_HASH"; exit 1; }
+          [[ "$SALT"      =~ ^[0-9a-f]{64}$ ]] || { echo "bad salt: $SALT"; exit 1; }
+
           IMG_SHA256=$(sha256sum /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img | awk '{print $1}')
           IMG_BYTES=$(stat -c %s /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img)
+          [[ "$IMG_SHA256" =~ ^[0-9a-f]{64}$ ]] || { echo "bad image sha256: $IMG_SHA256"; exit 1; }
+          # GPU UVM is always hundreds of MB; anything tiny means we measured
+          # a dangling symlink or an empty file.
+          [[ "$IMG_BYTES" -gt 104857600 ]] || { echo "image suspiciously small: $IMG_BYTES bytes"; exit 1; }
 
           jq -n \
             --arg kata_ref       "${{ needs.meta.outputs.kata_ref }}" \
@@ -310,6 +347,7 @@ jobs:
               source: {caa_commit: $caa_commit, build_date: $build_date}
             }' > /tmp/uvm-out/measurements.json
 
+          jq -e . /tmp/uvm-out/measurements.json >/dev/null
           cat /tmp/uvm-out/measurements.json
           echo "root_hash=$ROOT_HASH" >> "$GITHUB_OUTPUT"
           echo "img_sha256=$IMG_SHA256" >> "$GITHUB_OUTPUT"

From b9c89df4f0e1e4d219f91989714f0a9e968aa740 Mon Sep 17 00:00:00 2001
From: Alhassan Khedr <alhassan.khedr@cohere.com>
Date: Thu, 14 May 2026 23:14:23 -0400
Subject: [PATCH 5/8] ci: add kata_nvidia_driver_ver input to override the
 pinned NVIDIA driver

Kata 3.30.0 pins driver=595.58.03 in versions.yaml, but on 8x B200 OCI
hosts that driver hits a fabric-probe race where RmGpuFabricProbe times
out and fail-stops GPU init. The fix landed in 595.71.05 (which is also
the version present in the working mkosi-built images).

This adds an optional workflow_dispatch input `kata_nvidia_driver_ver`.
When set (e.g. to 595.71.05), the build:

- Rewrites .externals.nvidia.driver.version in kata's versions.yaml
  before the rootfs build, so the pin flows through to both
  open-gpu-kernel-modules (cloned from the GitHub tag) and the
  nvidia-driver-pinning-<ver> apt package.
- Surfaces the override in the OCI tag (kata-...-drv-<ver>), the
  com.cohere.kata-uvm.nvidia-driver annotation, measurements.json's
  new nvidia_driver.version field, and the job summary.

When unset, the driver pin is left untouched and the image is built
exactly as before; only the reporting changes. measurements.json (and
the nvidia-driver annotation) always reflect the *actually baked-in*
driver (read from the post-rewrite versions.yaml) rather than the
requested input, so they stay truthful when the override is empty.

Mirrors the same mechanic in fortress/scratch/oci-b200/k8s/04-build-uvm-locally.sh.
---
 .github/workflows/build-kata-uvm-cohere.yaml | 63 ++++++++++++++++++--
 1 file changed, 57 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml
index 1da3a8329d..af8fd36aa8 100644
--- a/.github/workflows/build-kata-uvm-cohere.yaml
+++ b/.github/workflows/build-kata-uvm-cohere.yaml
@@ -60,6 +60,17 @@ on:
         required: false
         type: string
         default: "cohere"
+      kata_nvidia_driver_ver:
+        description: |
+          Override .externals.nvidia.driver.version in kata's versions.yaml
+          (e.g. 595.71.05). Leave empty to use kata's default. The tag must
+          exist at https://github.com/NVIDIA/open-gpu-kernel-modules and a
+          matching nvidia-driver-pinning-<ver> package must exist in the
+          NVIDIA CUDA apt repo. Required to fix the 8x B200 fabric-probe
+          race in kata 3.30.0's default 595.58.03 driver pin.
+        required: false
+        type: string
+        default: ""
       nvidia_gpu_stack:
         description: "NVIDIA GPU stack components (driver= is added from versions.yaml)"
         required: false
@@ -89,20 +100,22 @@ jobs:
       kata_ref: ${{ steps.compute.outputs.kata_ref }}
       gc_repo: ${{ steps.compute.outputs.gc_repo }}
       gc_ref: ${{ steps.compute.outputs.gc_ref }}
+      kata_nvidia_driver_ver: ${{ steps.compute.outputs.kata_nvidia_driver_ver }}
       nvidia_gpu_stack: ${{ steps.compute.outputs.nvidia_gpu_stack }}
     steps:
       - name: Compute tag and inputs
         id: compute
         env:
-          KATA_REF: ${{ inputs.kata_ref || '3.30.0' }}
-          GC_REPO:  ${{ inputs.gc_repo  || 'https://github.com/cohere-ai/guest-components.git' }}
-          GC_REF:   ${{ inputs.gc_ref   || 'cohere' }}
-          STACK:    ${{ inputs.nvidia_gpu_stack || 'compute,dcgm,nvswitch' }}
-          SUFFIX:   ${{ inputs.tag_suffix || '' }}
+          KATA_REF:       ${{ inputs.kata_ref || '3.30.0' }}
+          GC_REPO:        ${{ inputs.gc_repo  || 'https://github.com/cohere-ai/guest-components.git' }}
+          GC_REF:         ${{ inputs.gc_ref   || 'cohere' }}
+          DRIVER_VER:     ${{ inputs.kata_nvidia_driver_ver || '' }}
+          STACK:          ${{ inputs.nvidia_gpu_stack || 'compute,dcgm,nvswitch' }}
+          SUFFIX:         ${{ inputs.tag_suffix || '' }}
         run: |
           # Tag pattern:
           #   kata-uvm-v* push  -> use the tag literal (after stripping `kata-uvm-`)
-          #   workflow_dispatch -> kata-${KATA_REF}-gc-${GC_REF_SHORT}[suffix]
+          #   workflow_dispatch -> kata-${KATA_REF}-gc-${GC_REF_SHORT}[-drv-<ver>][suffix]
           #   branch push       -> cohere-latest
           if [[ "$GITHUB_REF" == refs/tags/kata-uvm-v* ]]; then
             TAG="${GITHUB_REF#refs/tags/kata-uvm-}"
@@ -110,6 +123,10 @@ jobs:
             GC_SHORT="${GC_REF//\//-}"
             GC_SHORT="${GC_SHORT:0:12}"
             TAG="kata-${KATA_REF//\//-}-gc-${GC_SHORT}"
+            # If the caller overrode the NVIDIA driver pin, surface it in the
+            # OCI tag so the artifact name unambiguously identifies which
+            # driver is baked in.
+            [ -n "$DRIVER_VER" ] && TAG="${TAG}-drv-${DRIVER_VER}"
           else
             TAG="cohere-latest"
           fi
@@ -121,6 +138,7 @@ jobs:
             echo "kata_ref=$KATA_REF"
             echo "gc_repo=$GC_REPO"
             echo "gc_ref=$GC_REF"
+            echo "kata_nvidia_driver_ver=$DRIVER_VER"
             echo "nvidia_gpu_stack=$STACK"
           } >> "$GITHUB_OUTPUT"
 
@@ -208,6 +226,27 @@ jobs:
           echo "----- updated versions.yaml (coco-guest-components) -----"
           yq '.externals."coco-guest-components"' versions.yaml
 
+      - name: Override NVIDIA driver pin in versions.yaml
+        # kata 3.30.0's versions.yaml pins driver=595.58.03, but on 8x B200
+        # OCI hosts that driver hits a fabric-probe race (kernel timeout in
+        # RmGpuFabricProbe -> fail-stop). The fix landed in 595.71.05. The
+        # build pulls open kernel modules from
+        # https://github.com/NVIDIA/open-gpu-kernel-modules tags, and the
+        # userspace via nvidia-driver-pinning-<ver> from the NVIDIA CUDA
+        # apt repo, so any version that exists in both places is a valid
+        # override. Skipped when input is empty.
+        if: needs.meta.outputs.kata_nvidia_driver_ver != ''
+        env:
+          DRIVER_VER: ${{ needs.meta.outputs.kata_nvidia_driver_ver }}
+        run: |
+          set -eux
+          cd /tmp/kata
+          OLD_VER=$(yq '.externals.nvidia.driver.version' versions.yaml | tr -d '"')
+          yq -i ".externals.nvidia.driver.version = \"${DRIVER_VER}\"" versions.yaml
+          echo "NVIDIA driver pin: ${OLD_VER} -> ${DRIVER_VER}"
+          echo "----- updated versions.yaml (nvidia.driver) -----"
+          yq '.externals.nvidia.driver' versions.yaml
+
       - name: Build rootfs-image-nvidia-gpu-confidential
         env:
           NVIDIA_GPU_STACK: ${{ needs.meta.outputs.nvidia_gpu_stack }}
@@ -314,10 +353,17 @@ jobs:
           # a dangling symlink or an empty file.
           [[ "$IMG_BYTES" -gt 104857600 ]] || { echo "image suspiciously small: $IMG_BYTES bytes"; exit 1; }
 
+          # Resolve the *actual* baked-in driver pin from versions.yaml so
+          # the artifact reports what is really installed, not just what
+          # was requested. (When kata_nvidia_driver_ver is empty we want
+          # kata's default to be reflected here.)
+          DRIVER_VER=$(yq '.externals.nvidia.driver.version' /tmp/kata/versions.yaml | tr -d '"')
+
           jq -n \
             --arg kata_ref       "${{ needs.meta.outputs.kata_ref }}" \
             --arg gc_repo        "${{ needs.meta.outputs.gc_repo }}" \
             --arg gc_ref         "${{ needs.meta.outputs.gc_ref }}" \
+            --arg driver_ver     "$DRIVER_VER" \
             --arg nvidia_stack   "${{ needs.meta.outputs.nvidia_gpu_stack }}" \
             --arg root_hash      "$ROOT_HASH" \
             --arg salt           "$SALT" \
@@ -331,6 +377,7 @@ jobs:
             '{
               kata_ref: $kata_ref,
               guest_components: {repo: $gc_repo, ref: $gc_ref},
+              nvidia_driver: {version: $driver_ver},
               nvidia_gpu_stack: $nvidia_stack,
               dm_verity: {
                 root_hash:       $root_hash,
@@ -351,6 +398,7 @@ jobs:
           cat /tmp/uvm-out/measurements.json
           echo "root_hash=$ROOT_HASH" >> "$GITHUB_OUTPUT"
           echo "img_sha256=$IMG_SHA256" >> "$GITHUB_OUTPUT"
+          echo "driver_ver=$DRIVER_VER" >> "$GITHUB_OUTPUT"
 
       - name: Compress .image for transport
         run: |
@@ -375,6 +423,7 @@ jobs:
           OCI_TAG:    ${{ needs.meta.outputs.tag }}
           ROOT_HASH:  ${{ steps.measure.outputs.root_hash }}
           IMG_SHA256: ${{ steps.measure.outputs.img_sha256 }}
+          DRIVER_VER: ${{ steps.measure.outputs.driver_ver }}
         run: |
           set -eux
           OCI_REF="${OCI_IMAGE}:${OCI_TAG}"
@@ -392,6 +441,7 @@ jobs:
             --annotation "com.cohere.kata.ref=${{ needs.meta.outputs.kata_ref }}" \
             --annotation "com.cohere.guest-components.repo=${{ needs.meta.outputs.gc_repo }}" \
             --annotation "com.cohere.guest-components.ref=${{ needs.meta.outputs.gc_ref }}" \
+            --annotation "com.cohere.kata-uvm.nvidia-driver=${DRIVER_VER}" \
             --annotation "com.cohere.kata-uvm.image-sha256=${IMG_SHA256}" \
             --annotation "com.cohere.kata-uvm.root-hash=${ROOT_HASH}" \
             --format json > oras-output.json
@@ -423,6 +473,7 @@ jobs:
             echo "| Digest | \`${{ steps.push.outputs.digest }}\` |"
             echo "| kata-containers ref | \`${{ needs.meta.outputs.kata_ref }}\` |"
             echo "| guest-components | \`${{ needs.meta.outputs.gc_repo }}@${{ needs.meta.outputs.gc_ref }}\` |"
+            echo "| NVIDIA driver | \`${{ steps.measure.outputs.driver_ver }}\` |"
             echo "| NVIDIA stack | \`${{ needs.meta.outputs.nvidia_gpu_stack }}\` |"
             echo "| root_hash | \`${{ steps.measure.outputs.root_hash }}\` |"
             echo "| image sha256 | \`${{ steps.measure.outputs.img_sha256 }}\` |"

From 6b51099ac1e49c3f4c404381ee8863403b112e25 Mon Sep 17 00:00:00 2001
From: Alhassan Khedr <alhassan.khedr@cohere.com>
Date: Thu, 14 May 2026 23:58:58 -0400
Subject: [PATCH 6/8] ci: default gc_ref to alhassankhedr/sync-main-to-cohere
 (PR #9)

The plain `cohere` branch of guest-components has a `count == 1` guard
in `nvidia-attester::detect_platform()` that silently disables the
attester on multi-GPU systems. Multi-GPU pods on 8x B200 boot fine but
`/aa/additional_evidence` returns empty, which looks like a build issue
but is actually the userspace attester refusing to register.

Upstream main has a complete rewrite of nvidia-attester on top of the
NVAT SDK (no `count == 1` check). PR #9 in cohere-ai/guest-components
syncs that rewrite into our fork. Until PR #9 merges into `cohere`,
default `gc_ref` to `alhassankhedr/sync-main-to-cohere` so kata UVM
builds out of this workflow have a working multi-GPU attester.

Switch back to `cohere` once PR #9 is merged.
---
 .github/workflows/build-kata-uvm-cohere.yaml | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml
index af8fd36aa8..4e2143c1f3 100644
--- a/.github/workflows/build-kata-uvm-cohere.yaml
+++ b/.github/workflows/build-kata-uvm-cohere.yaml
@@ -56,10 +56,20 @@ on:
         type: string
         default: "https://github.com/cohere-ai/guest-components.git"
       gc_ref:
-        description: "guest-components ref (branch, tag, or SHA)"
+        description: |
+          guest-components ref (branch, tag, or SHA).
+
+          Default: alhassankhedr/sync-main-to-cohere (head of PR #9).
+          That branch carries upstream main's nvidia-attester rewrite
+          (NVAT SDK based, no `count == 1` guard) and is required for
+          multi-GPU evidence to work end-to-end on 8x B200 hosts. The
+          plain `cohere` branch still has the old NVML-based attester
+          which silently produces empty evidence on 2+ GPU systems
+          (mod a sed `s/count == 1/count >= 1/` patch the podvm-mkosi
+          Dockerfile applies). Switch back to `cohere` after PR #9 merges.
         required: false
         type: string
-        default: "cohere"
+        default: "alhassankhedr/sync-main-to-cohere"
       kata_nvidia_driver_ver:
         description: |
           Override .externals.nvidia.driver.version in kata's versions.yaml
@@ -108,7 +118,7 @@ jobs:
         env:
           KATA_REF:       ${{ inputs.kata_ref || '3.30.0' }}
           GC_REPO:        ${{ inputs.gc_repo  || 'https://github.com/cohere-ai/guest-components.git' }}
-          GC_REF:         ${{ inputs.gc_ref   || 'cohere' }}
+          GC_REF:         ${{ inputs.gc_ref   || 'alhassankhedr/sync-main-to-cohere' }}
           DRIVER_VER:     ${{ inputs.kata_nvidia_driver_ver || '' }}
           STACK:          ${{ inputs.nvidia_gpu_stack || 'compute,dcgm,nvswitch' }}
           SUFFIX:         ${{ inputs.tag_suffix || '' }}

From d5e0166804292fc5da300cf1979e5c6398006c1f Mon Sep 17 00:00:00 2001
From: Alhassan Khedr <alhassan.khedr@cohere.com>
Date: Fri, 15 May 2026 00:26:24 -0400
Subject: [PATCH 7/8] ci: pin nvidia.nvat.version so attestation-agent-nv
 actually gets built
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a `kata_nvat_ver` workflow_dispatch input (default 2026.03.02) that
rewrites `.externals.nvidia.nvat.{version,url,desc}` in kata's
versions.yaml before the rootfs build.

Why this matters: kata's
tools/packaging/static-build/coco-guest-components/build.sh forwards
NVAT_VERSION from versions.yaml to the GC builder Dockerfile. The
Dockerfile gates the entire libnvat clone+cmake+install behind
`if [ -n "${NVAT_VERSION}" ]`, and upstream kata 3.30.0 ships *without*
that key set. Net effect on the cohere fork's UVM:

* libnvat is never built into the GC builder image.
* build-static-coco-guest-components.sh's second AA build pass — the
  one that compiles `attestation-agent` with `nvidia-attester` against
  /usr/local/lib/libnvat.so and installs the result as
  /usr/local/bin/attestation-agent-nv — silently no-ops because the
  required system lib is missing.
* The rootfs ends up with only the standard, non-NVIDIA AA. Symbol
  fingerprint of the installed UVM confirms it: zero `nvmlDeviceGetCount`,
  zero `nv_attestation_sdk`, zero `libnvat`.
* `/aa/additional_evidence` returns empty on multi-GPU pods regardless
  of which guest-components branch we baked. ITA appraisal can never
  see `nvgpu_overall: true`.

Pins 2026.03.02 to match the version the podvm-mkosi side already
builds against (NVAT_TAG in cloud-api-adaptor's
Dockerfile.podvm_binaries.ubuntu).

Tag, measurements.json, and OCI annotations all surface the pin so the
binding is inspectable from the registry (`-nvat-<ver>` tag suffix,
`nvat_sdk.version` field, `com.cohere.kata-uvm.nvat-sdk` annotation).
---
 .github/workflows/build-kata-uvm-cohere.yaml | 82 ++++++++++++++++++--
 1 file changed, 77 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml
index 4e2143c1f3..8099af0be4 100644
--- a/.github/workflows/build-kata-uvm-cohere.yaml
+++ b/.github/workflows/build-kata-uvm-cohere.yaml
@@ -81,6 +81,24 @@ on:
         required: false
         type: string
         default: ""
+      kata_nvat_ver:
+        description: |
+          Pin .externals.nvidia.nvat.version in kata's versions.yaml. Default
+          2026.03.02 (a real tag in https://github.com/NVIDIA/attestation-sdk).
+
+          Without this pin, kata's coco-guest-components builder Dockerfile
+          skips the libnvat build (its `if [ -n "${NVAT_VERSION}" ]` guard
+          short-circuits), which makes the second AA build pass — the one
+          that links the nvidia-attester cargo feature against
+          /usr/local/lib/libnvat.so and installs the result as
+          /usr/local/bin/attestation-agent-nv — silently no-op. Net effect:
+          the AA baked into the rootfs has no GPU evidence support
+          regardless of gc_ref, and /aa/additional_evidence on multi-GPU
+          pods returns empty. Set "" to leave nvat unpinned (matches
+          upstream kata 3.30.0 behaviour).
+        required: false
+        type: string
+        default: "2026.03.02"
       nvidia_gpu_stack:
         description: "NVIDIA GPU stack components (driver= is added from versions.yaml)"
         required: false
@@ -111,6 +129,7 @@ jobs:
       gc_repo: ${{ steps.compute.outputs.gc_repo }}
       gc_ref: ${{ steps.compute.outputs.gc_ref }}
       kata_nvidia_driver_ver: ${{ steps.compute.outputs.kata_nvidia_driver_ver }}
+      kata_nvat_ver: ${{ steps.compute.outputs.kata_nvat_ver }}
       nvidia_gpu_stack: ${{ steps.compute.outputs.nvidia_gpu_stack }}
     steps:
       - name: Compute tag and inputs
@@ -120,12 +139,13 @@ jobs:
           GC_REPO:        ${{ inputs.gc_repo  || 'https://github.com/cohere-ai/guest-components.git' }}
           GC_REF:         ${{ inputs.gc_ref   || 'alhassankhedr/sync-main-to-cohere' }}
           DRIVER_VER:     ${{ inputs.kata_nvidia_driver_ver || '' }}
+          NVAT_VER:       ${{ github.event_name != 'workflow_dispatch' && '2026.03.02' || inputs.kata_nvat_ver }}  # default only off-dispatch, so an explicit "" really means unpinned
           STACK:          ${{ inputs.nvidia_gpu_stack || 'compute,dcgm,nvswitch' }}
           SUFFIX:         ${{ inputs.tag_suffix || '' }}
         run: |
           # Tag pattern:
           #   kata-uvm-v* push  -> use the tag literal (after stripping `kata-uvm-`)
-          #   workflow_dispatch -> kata-${KATA_REF}-gc-${GC_REF_SHORT}[-drv-<ver>][suffix]
+          #   workflow_dispatch -> kata-${KATA_REF}-gc-${GC_REF_SHORT}[-drv-<ver>][-nvat-<ver>][suffix]
           #   branch push       -> cohere-latest
           if [[ "$GITHUB_REF" == refs/tags/kata-uvm-v* ]]; then
             TAG="${GITHUB_REF#refs/tags/kata-uvm-}"
@@ -137,6 +157,10 @@ jobs:
             # OCI tag so the artifact name unambiguously identifies which
             # driver is baked in.
             [ -n "$DRIVER_VER" ] && TAG="${TAG}-drv-${DRIVER_VER}"
+            # Same for the NVAT SDK pin: the artifact ABI changes meaningfully
+            # between NVAT releases (libnvat soname + GpuEvidenceSource API),
+            # so make this visible too.
+            [ -n "$NVAT_VER" ]   && TAG="${TAG}-nvat-${NVAT_VER}"
           else
             TAG="cohere-latest"
           fi
@@ -149,6 +173,7 @@ jobs:
             echo "gc_repo=$GC_REPO"
             echo "gc_ref=$GC_REF"
             echo "kata_nvidia_driver_ver=$DRIVER_VER"
+            echo "kata_nvat_ver=$NVAT_VER"
             echo "nvidia_gpu_stack=$STACK"
           } >> "$GITHUB_OUTPUT"
 
@@ -257,6 +282,40 @@ jobs:
           echo "----- updated versions.yaml (nvidia.driver) -----"
           yq '.externals.nvidia.driver' versions.yaml
 
+      - name: Pin NVIDIA Attestation SDK (libnvat) in versions.yaml
+        # kata's tools/packaging/static-build/coco-guest-components/build.sh
+        # reads `.externals.nvidia.nvat.version` and forwards it to the GC
+        # builder Dockerfile as NVAT_VERSION. The Dockerfile gates the entire
+        # libnvat clone+cmake+install behind `if [ -n "${NVAT_VERSION}" ]`,
+        # so an unset key (the default in upstream kata 3.30.0) means no
+        # libnvat in the builder image. Without libnvat,
+        # build-static-coco-guest-components.sh's second build pass — the one
+        # that compiles AA with `nvidia-attester` against
+        # /usr/local/lib/libnvat.so and installs the result as
+        # /usr/local/bin/attestation-agent-nv — silently no-ops, and the
+        # rootfs ends up with only the no-NVIDIA AA. /aa/additional_evidence
+        # then returns empty regardless of which guest-components we baked.
+        # Pin a real attestation-sdk tag here so libnvat actually gets built.
+        if: needs.meta.outputs.kata_nvat_ver != ''
+        env:
+          NVAT_VER: ${{ needs.meta.outputs.kata_nvat_ver }}
+        run: |
+          set -eux
+          cd /tmp/kata
+          OLD_VER=$(yq '.externals.nvidia.nvat.version // ""' versions.yaml | tr -d '"')
+          yq -i \
+            ".externals.nvidia.nvat.version = \"${NVAT_VER}\" |
+             .externals.nvidia.nvat.url     = \"https://github.com/NVIDIA/attestation-sdk\" |
+             .externals.nvidia.nvat.desc    = \"NVIDIA Attestation SDK (libnvat); enables attestation-agent-nv\"" \
+            versions.yaml
+          if [[ -z "$OLD_VER" || "$OLD_VER" == "null" ]]; then
+            echo "NVAT SDK pin: (unset) -> ${NVAT_VER}"
+          else
+            echo "NVAT SDK pin: ${OLD_VER} -> ${NVAT_VER}"
+          fi
+          echo "----- updated versions.yaml (nvidia.nvat) -----"
+          yq '.externals.nvidia.nvat' versions.yaml
+
       - name: Build rootfs-image-nvidia-gpu-confidential
         env:
           NVIDIA_GPU_STACK: ${{ needs.meta.outputs.nvidia_gpu_stack }}
@@ -363,17 +422,19 @@ jobs:
           # a dangling symlink or an empty file.
           [[ "$IMG_BYTES" -gt 104857600 ]] || { echo "image suspiciously small: $IMG_BYTES bytes"; exit 1; }
 
-          # Resolve the *actual* baked-in driver pin from versions.yaml so
-          # the artifact reports what is really installed, not just what
-          # was requested. (When kata_nvidia_driver_ver is empty we want
-          # kata's default to be reflected here.)
+          # Resolve the *actual* baked-in driver / nvat pins from versions.yaml
+          # so the artifact reports what is really installed, not just what
+          # was requested. (When the kata_* inputs are empty we want kata's
+          # default to be reflected here.)
           DRIVER_VER=$(yq '.externals.nvidia.driver.version' /tmp/kata/versions.yaml | tr -d '"')
+          NVAT_VER=$(yq '.externals.nvidia.nvat.version // ""' /tmp/kata/versions.yaml | tr -d '"')
 
           jq -n \
             --arg kata_ref       "${{ needs.meta.outputs.kata_ref }}" \
             --arg gc_repo        "${{ needs.meta.outputs.gc_repo }}" \
             --arg gc_ref         "${{ needs.meta.outputs.gc_ref }}" \
             --arg driver_ver     "$DRIVER_VER" \
+            --arg nvat_ver       "$NVAT_VER" \
             --arg nvidia_stack   "${{ needs.meta.outputs.nvidia_gpu_stack }}" \
             --arg root_hash      "$ROOT_HASH" \
             --arg salt           "$SALT" \
@@ -388,6 +449,8 @@ jobs:
               kata_ref: $kata_ref,
               guest_components: {repo: $gc_repo, ref: $gc_ref},
               nvidia_driver: {version: $driver_ver},
+              nvat_sdk:      ( if $nvat_ver == "" or $nvat_ver == "null" then null
+                               else {version: $nvat_ver} end ),
               nvidia_gpu_stack: $nvidia_stack,
               dm_verity: {
                 root_hash:       $root_hash,
@@ -409,6 +472,7 @@ jobs:
           echo "root_hash=$ROOT_HASH" >> "$GITHUB_OUTPUT"
           echo "img_sha256=$IMG_SHA256" >> "$GITHUB_OUTPUT"
           echo "driver_ver=$DRIVER_VER" >> "$GITHUB_OUTPUT"
+          echo "nvat_ver=$NVAT_VER" >> "$GITHUB_OUTPUT"
 
       - name: Compress .image for transport
         run: |
@@ -434,10 +498,16 @@ jobs:
           ROOT_HASH:  ${{ steps.measure.outputs.root_hash }}
           IMG_SHA256: ${{ steps.measure.outputs.img_sha256 }}
           DRIVER_VER: ${{ steps.measure.outputs.driver_ver }}
+          NVAT_VER:   ${{ steps.measure.outputs.nvat_ver }}
         run: |
           set -eux
           OCI_REF="${OCI_IMAGE}:${OCI_TAG}"
           cd /tmp/uvm-out
+          # NVAT annotation is conditional: an empty string would push a
+          # value-less label which is misleading.
+          NVAT_ANNOTATION=()
+          [[ -n "$NVAT_VER" ]] && NVAT_ANNOTATION+=(--annotation "com.cohere.kata-uvm.nvat-sdk=${NVAT_VER}")
+
           oras push "$OCI_REF" \
             kata-containers-nvidia-gpu-confidential.img.zst:application/vnd.cohere.kata-uvm.image+zstd \
             root_hash.txt:application/vnd.cohere.kata-uvm.verity+plain \
@@ -452,6 +522,7 @@ jobs:
             --annotation "com.cohere.guest-components.repo=${{ needs.meta.outputs.gc_repo }}" \
             --annotation "com.cohere.guest-components.ref=${{ needs.meta.outputs.gc_ref }}" \
             --annotation "com.cohere.kata-uvm.nvidia-driver=${DRIVER_VER}" \
+            "${NVAT_ANNOTATION[@]}" \
             --annotation "com.cohere.kata-uvm.image-sha256=${IMG_SHA256}" \
             --annotation "com.cohere.kata-uvm.root-hash=${ROOT_HASH}" \
             --format json > oras-output.json
@@ -484,6 +555,7 @@ jobs:
             echo "| kata-containers ref | \`${{ needs.meta.outputs.kata_ref }}\` |"
             echo "| guest-components | \`${{ needs.meta.outputs.gc_repo }}@${{ needs.meta.outputs.gc_ref }}\` |"
             echo "| NVIDIA driver | \`${{ steps.measure.outputs.driver_ver }}\` |"
+            echo "| NVAT SDK | \`${{ steps.measure.outputs.nvat_ver || '(unset — attestation-agent-nv NOT built)' }}\` |"
             echo "| NVIDIA stack | \`${{ needs.meta.outputs.nvidia_gpu_stack }}\` |"
             echo "| root_hash | \`${{ steps.measure.outputs.root_hash }}\` |"
             echo "| image sha256 | \`${{ steps.measure.outputs.img_sha256 }}\` |"

From a55b3acb2a395945633422b19bd559087662927f Mon Sep 17 00:00:00 2001
From: Alhassan Khedr <alhassan.khedr@cohere.com>
Date: Fri, 15 May 2026 12:25:39 -0400
Subject: [PATCH 8/8] ci(kata-uvm): ship paired kernel binary alongside rootfs
 to fix Bug F

kata's kernel-nvidia-gpu build emits a fresh random
certs/signing_key.pem per invocation; the NVIDIA modules baked into
kata-static-kernel-nvidia-gpu-modules.tar.zst (and therefore into the
rootfs) are signed against THAT key. If the host launches our UVM
against a kernel from a different build (e.g. the kata-deploy-bundled
one), every NVIDIA .ko is rejected at first modprobe with "Loading of
unsigned module is rejected", NVRC panics in src/execute.rs:24:9, the
guest powers down, and pods sit in Pending forever. Verified
end-to-end on the B200 host on 2026-05-15 (README "Bug F").

The host-side fix lives in fortress's 05-install-uvm.sh, which
atomically installs both the rootfs symlink and the kernel binary. For
that to work, the OCI artifact has to ship the kernel. Mirror the
local build pipeline (04-build-uvm-locally.sh) here:

  * Force a clean kernel + modules + rootfs rebuild whenever
    kata_nvidia_driver_ver is overridden, so kata's make can't reuse a
    cached kernel-nvidia-gpu builddir whose embedded signing key
    doesn't match the new modules tarball.

  * After "Build rootfs", stage the locally-built vmlinuz (+ vmlinux,
    System.map, config) into /tmp/uvm-out alongside the rootfs and
    write kernel.basename as a single source of truth for the install
    side.

  * Add a defensive signing-key sanity check that extracts the SKID
    from kernel-nvidia-gpu/builddir/.../certs/signing_key.x509 and
    confirms it appears in the trailing PKCS#7 signature of nvidia.ko.
    Fails the build if the modules tarball is signed by a different
    key than the kernel embeds.

  * Extend measurements.json with .kernel.{filename,sha256} so
    05-install-uvm.sh can validate the kernel post-pull.

  * Push the kernel files (vmlinuz/vmlinux/System.map/config and
    kernel.basename) into the OCI artifact with media type
    application/vnd.cohere.kata-uvm.kernel+octet-stream, and surface
    the kernel-basename + kernel-sha256 as OCI annotations.

After this, the UVM artifact is self-contained: pulling and installing
it places a kernel and rootfs that share a signing key, so guest
modprobe of nvidia.ko / nvidia-uvm.ko / nvidia-modeset.ko / nvidia-drm.ko
/ nvidia-peermem.ko succeeds and NVRC boots cleanly.
---
 .github/workflows/build-kata-uvm-cohere.yaml | 140 +++++++++++++++++++
 1 file changed, 140 insertions(+)

diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml
index 8099af0be4..f7a16ab177 100644
--- a/.github/workflows/build-kata-uvm-cohere.yaml
+++ b/.github/workflows/build-kata-uvm-cohere.yaml
@@ -316,6 +316,33 @@ jobs:
           echo "----- updated versions.yaml (nvidia.nvat) -----"
           yq '.externals.nvidia.nvat' versions.yaml
 
+      - name: Force a clean kernel + rootfs rebuild when overriding the driver
+        # Mirrors KATA_NVIDIA_FORCE_REBUILD in
+        # fortress/scratch/oci-b200/k8s/04-build-uvm-locally.sh. Each kata
+        # kernel build generates a fresh random `certs/signing_key.pem`,
+        # and the NVIDIA modules in
+        # `kata-static-kernel-nvidia-gpu-modules.tar.zst` are signed
+        # against THAT key. If kata's make reuses any cached kernel-
+        # nvidia-gpu/ artifacts while we're trying to bump the driver
+        # version, we end up with userspace<->kernel ABI skew or, worse,
+        # NVIDIA `.ko` files signed against a different key than the
+        # one the kernel binary embeds (README "Bug F"). On a fresh CI
+        # runner this is a no-op, but if we ever start caching the
+        # kata checkout between runs (or someone re-runs a job with a
+        # different driver_ver) the wipe makes the build deterministic.
+        if: needs.meta.outputs.kata_nvidia_driver_ver != ''
+        run: |
+          set -eux
+          BUILD=/tmp/kata/tools/packaging/kata-deploy/local-build/build
+          # rm -f ignores missing paths, so a failure here is real (e.g. root-owned leftovers): let set -e abort.
+          rm -rf  "$BUILD/kernel-nvidia-gpu" \
+                  "$BUILD/kata-static-kernel-nvidia-gpu-modules.tar.zst" \
+                  "$BUILD/kata-static-kernel-nvidia-gpu.tar.zst" \
+                  "$BUILD/rootfs-image-nvidia-gpu-confidential" \
+                  "$BUILD/rootfs-nvidia-gpu-confidential-stage-one" \
+                  "$BUILD/kata-static-rootfs-image-nvidia-gpu-confidential.tar.zst"
+          echo "wiped kernel-nvidia-gpu/, modules tarball, and rootfs build dirs"
+
       - name: Build rootfs-image-nvidia-gpu-confidential
         env:
           NVIDIA_GPU_STACK: ${{ needs.meta.outputs.nvidia_gpu_stack }}
@@ -379,6 +406,78 @@ jobs:
           cat /tmp/uvm-out/root_hash.txt
           file /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img
 
+      - name: Stage paired kernel binary alongside the rootfs
+        # WHY: every invocation of kata's kernel-nvidia-gpu build emits
+        # a fresh random `certs/signing_key.pem` and signs the NVIDIA
+        # modules with it, including the ones in the modules tarball
+        # that nvidia_rootfs.sh extracts into the rootfs. The rootfs
+        # produced by the previous step therefore carries NVIDIA `.ko`
+        # files signed against THIS build's key. If the host running
+        # the resulting UVM uses a kernel from a different build (e.g.
+        # the kata-deploy-bundled one), the modules are rejected at
+        # first modprobe, NVRC panics, the guest powers down, and pods
+        # sit in Pending. Verified end-to-end on 2026-05-15 (README
+        # "Bug F"). The install side
+        # (fortress/scratch/oci-b200/k8s/05-install-uvm.sh) fixes this
+        # by atomically installing the kernel and the rootfs from the
+        # same build, so the OCI artifact has to ship the kernel binary
+        # alongside the rootfs; this step copies it into /tmp/uvm-out.
+        run: |
+          set -euxo pipefail
+          kernel_dir=/tmp/kata/tools/packaging/kata-deploy/local-build/build/kernel-nvidia-gpu/destdir/opt/kata/share/kata-containers
+          kernel_path=$(ls "${kernel_dir}"/vmlinuz-*-nvidia-gpu 2>/dev/null | head -n1 || true)
+          [[ -n "$kernel_path" ]] || {
+            echo "FATAL: no locally-built kernel at ${kernel_dir}/vmlinuz-*-nvidia-gpu" >&2
+            exit 1
+          }
+          kernel_name="${kernel_path##*/}"
+          kernel_ver="${kernel_name#vmlinuz-}"
+          cp -p "$kernel_path" "/tmp/uvm-out/${kernel_name}"
+          for prefix in vmlinux System.map config; do
+            sibling="${kernel_dir}/${prefix}-${kernel_ver}"
+            if [[ -f "$sibling" ]]; then cp -p "$sibling" /tmp/uvm-out/; fi
+          done
+          # Records which kernel pairs with this rootfs; 05-install-uvm.sh reads it on install.
+          echo "$kernel_name" > /tmp/uvm-out/kernel.basename
+          ls -lh /tmp/uvm-out/
+
+      - name: Verify NVIDIA modules signing key matches the kernel
+        # Defensive check that the SKID embedded in the kernel's
+        # `certs/signing_key.x509` appears (hex-encoded) somewhere in
+        # nvidia.ko from the NVIDIA modules tarball, i.e. in its trailing
+        # PKCS#7 signature. Same gate fortress's 04-build-uvm-locally.sh
+        # applies. A mismatch fails the job here, in CI, instead of
+        # surfacing via guest serial capture in production. Missing inputs
+        # skip the check (best effort) but raise a workflow warning.
+        # NOTE(review): confirm the builddir is really named
+        # kata-linux-<ver>-nvidia-gpu; if not, this gate always skips.
+        run: |
+          set -euxo pipefail
+          KBASENAME=$(cat /tmp/uvm-out/kernel.basename)
+          KVER_VERSION="${KBASENAME#vmlinuz-}"
+          SIGNING_X509=/tmp/kata/tools/packaging/kata-deploy/local-build/build/kernel-nvidia-gpu/builddir/kata-linux-${KVER_VERSION}/certs/signing_key.x509
+          MODULES_TARBALL=/tmp/kata/tools/packaging/kata-deploy/local-build/build/kata-static-kernel-nvidia-gpu-modules.tar.zst
+          skip() { echo "::warning title=Signing-key check skipped::$*"; exit 0; }
+          [[ -f "$SIGNING_X509" && -f "$MODULES_TARBALL" ]] || skip "missing $SIGNING_X509 or $MODULES_TARBALL"
+          KEY_SKID=$(openssl x509 -in "$SIGNING_X509" -noout -text \
+                       | awk '/X509v3 Subject Key Identifier/{getline; gsub(/[: ]/,""); print tolower($0); exit}')
+          [[ -n "$KEY_SKID" ]] || skip "could not extract SKID from $SIGNING_X509"
+          TMP=$(mktemp -d); trap 'rm -rf "$TMP"' EXIT
+          # tar exits non-zero when the pattern matches nothing; tolerate it so the skip below is reachable.
+          tar --zstd -xf "$MODULES_TARBALL" -C "$TMP" --wildcards '*/kernel/drivers/video/nvidia.ko' || true
+          SAMPLE_KO=$(find "$TMP" -name nvidia.ko -print -quit)
+          [[ -n "$SAMPLE_KO" ]] || skip "no nvidia.ko in $MODULES_TARBALL"
+          # grep -c (not -q) drains its input, so xxd is never SIGPIPE'd into a pipefail
+          # false negative; tr joins xxd's wrapped lines so the SKID cannot straddle a break.
+          if xxd -p "$SAMPLE_KO" | tr -d '\n' | grep -ciF -- "$KEY_SKID" >/dev/null; then
+            echo "OK: nvidia.ko signed by this build's signing key (SKID=$KEY_SKID)"
+          else
+            echo "FATAL: nvidia.ko in modules tarball is NOT signed by the kernel's signing_key.x509 (SKID=$KEY_SKID)"
+            echo "       guest will reject NVIDIA modules at first modprobe; pod will sit in Pending"
+            exit 1
+          fi
+
       - name: Surface verity params as JSON metadata
         id: measure
         # The root_hash.txt file is the source of truth for kata's
@@ -422,6 +521,17 @@ jobs:
           # a dangling symlink or an empty file.
           [[ "$IMG_BYTES" -gt 104857600 ]] || { echo "image suspiciously small: $IMG_BYTES bytes"; exit 1; }
 
+          # Paired kernel binary surfaced by the "Stage paired kernel" step.
+          # The install side (fortress/scratch/oci-b200/k8s/05-install-uvm.sh)
+          # uses .kernel.{filename,sha256} from measurements.json to validate
+          # and atomically install the kernel alongside the rootfs.
+          KERNEL_BASENAME=""
+          KERNEL_SHA=""
+          if [[ -f /tmp/uvm-out/kernel.basename ]]; then
+            KERNEL_BASENAME=$(cat /tmp/uvm-out/kernel.basename)
+            KERNEL_SHA=$(sha256sum "/tmp/uvm-out/${KERNEL_BASENAME}" | awk '{print $1}')
+          fi
+
           # Resolve the *actual* baked-in driver / nvat pins from versions.yaml
           # so the artifact reports what is really installed, not just what
           # was requested. (When the kata_* inputs are empty we want kata's
@@ -443,6 +553,8 @@ jobs:
             --arg hash_block_sz  "$HASH_BLOCK_SIZE" \
             --arg img_sha256     "$IMG_SHA256" \
             --arg img_bytes      "$IMG_BYTES" \
+            --arg kernel_name    "$KERNEL_BASENAME" \
+            --arg kernel_sha     "$KERNEL_SHA" \
             --arg caa_commit     "$GITHUB_SHA" \
             --arg build_date     "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
             '{
@@ -464,6 +576,8 @@ jobs:
                 sha256:   $img_sha256,
                 bytes:    ($img_bytes | tonumber)
               },
+              kernel: ( if $kernel_name == "" then null
+                        else {filename: $kernel_name, sha256: $kernel_sha} end ),
               source: {caa_commit: $caa_commit, build_date: $build_date}
             }' > /tmp/uvm-out/measurements.json
 
@@ -473,6 +587,8 @@ jobs:
           echo "img_sha256=$IMG_SHA256" >> "$GITHUB_OUTPUT"
           echo "driver_ver=$DRIVER_VER" >> "$GITHUB_OUTPUT"
           echo "nvat_ver=$NVAT_VER" >> "$GITHUB_OUTPUT"
+          echo "kernel_basename=$KERNEL_BASENAME" >> "$GITHUB_OUTPUT"
+          echo "kernel_sha=$KERNEL_SHA" >> "$GITHUB_OUTPUT"
 
       - name: Compress .image for transport
         run: |
@@ -508,10 +624,30 @@ jobs:
           NVAT_ANNOTATION=()
           [[ -n "$NVAT_VER" ]] && NVAT_ANNOTATION+=(--annotation "com.cohere.kata-uvm.nvat-sdk=${NVAT_VER}")
 
+          # The paired kernel (and its sibling artifacts) MUST ride along
+          # with the rootfs in the same OCI artifact so 05-install-uvm.sh
+          # can install both atomically. See README "Bug F" for the full
+          # mechanism. We push the kernel uncompressed (~80 MiB raw); zstd
+          # would only shave ~20 MiB and complicates the install side.
+          KERNEL_FILES=()
+          if [[ -f kernel.basename ]]; then
+            KBASENAME=$(cat kernel.basename)
+            KVER_VERSION="${KBASENAME#vmlinuz-}"
+            for kf in "$KBASENAME" "kernel.basename" \
+                      "vmlinux-${KVER_VERSION}" \
+                      "System.map-${KVER_VERSION}" \
+                      "config-${KVER_VERSION}"; do
+              if [[ -f "$kf" ]]; then
+                KERNEL_FILES+=( "${kf}:application/vnd.cohere.kata-uvm.kernel+octet-stream" )
+              fi
+            done
+          fi
+
           oras push "$OCI_REF" \
             kata-containers-nvidia-gpu-confidential.img.zst:application/vnd.cohere.kata-uvm.image+zstd \
             root_hash.txt:application/vnd.cohere.kata-uvm.verity+plain \
             measurements.json:application/vnd.cohere.kata-uvm.measurements+json \
+            "${KERNEL_FILES[@]}" \
             --annotation "org.opencontainers.image.title=kata-uvm-nvidia-gpu-confidential" \
             --annotation "org.opencontainers.image.description=Kata Containers NVIDIA GPU confidential UVM image, built from source with cohere-ai/guest-components" \
             --annotation "org.opencontainers.image.source=https://github.com/${GITHUB_REPOSITORY}" \
@@ -525,6 +661,8 @@ jobs:
             "${NVAT_ANNOTATION[@]}" \
             --annotation "com.cohere.kata-uvm.image-sha256=${IMG_SHA256}" \
             --annotation "com.cohere.kata-uvm.root-hash=${ROOT_HASH}" \
+            --annotation "com.cohere.kata-uvm.kernel-basename=${{ steps.measure.outputs.kernel_basename }}" \
+            --annotation "com.cohere.kata-uvm.kernel-sha256=${{ steps.measure.outputs.kernel_sha }}" \
             --format json > oras-output.json
 
           cat oras-output.json
@@ -559,6 +697,8 @@ jobs:
             echo "| NVIDIA stack | \`${{ needs.meta.outputs.nvidia_gpu_stack }}\` |"
             echo "| root_hash | \`${{ steps.measure.outputs.root_hash }}\` |"
             echo "| image sha256 | \`${{ steps.measure.outputs.img_sha256 }}\` |"
+            echo "| kernel | \`${{ steps.measure.outputs.kernel_basename }}\` |"
+            echo "| kernel sha256 | \`${{ steps.measure.outputs.kernel_sha }}\` |"
             echo ""
             echo "Install on a B200 host with:"
             echo ""