From 2d808338c5d379a4b5e4bdbf9a653e3c73f2e8b0 Mon Sep 17 00:00:00 2001 From: Alhassan Khedr Date: Thu, 14 May 2026 14:20:39 -0400 Subject: [PATCH 1/8] ci: build NVIDIA GPU confidential Kata UVM image from source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a workflow that builds the kata-containers nvidia-gpu-confidential UVM image with our cohere-fork guest-components (attestation-agent + api-server-rest) baked in *at compile time*, instead of post-hoc patching the stock NVIDIA image with `losetup` + `veritysetup format` (which is what fortress/scratch/oci-b200/k8s/06-patch-uvm.sh has been doing). Mechanics: 1. Check out kata-containers @ inputs.kata_ref (default 3.30.0). 2. Rewrite versions.yaml: point externals.coco-guest-components.url and .version at cohere-ai/guest-components @ (resolved to a SHA via git ls-remote so the build is reproducible). 3. `make rootfs-image-nvidia-gpu-confidential-tarball` — kata's existing build infrastructure clones our fork into the coco-guest-components builder container, statically builds AA + api-server-rest + CDH, and nvidia_rootfs.sh::coco_guest_components() copies them into the rootfs at /usr/local/bin/. From there the standard rootfs assembly + dm-verity formatting runs unchanged. 4. Extract the .image and root_hash file from the tarball, surface dm-verity params (root_hash, salt, data_blocks, block sizes) and the image sha256 as a measurements.json layer. 5. zstd -19 the .image, push to GHCR via oras as a 3-layer artifact with annotations covering build provenance + verity params. 6. SLSA build provenance attestation. Output: ghcr.io/cohere-ai/cloud-api-adaptor/kata-uvm-nvidia-gpu-confidential: where is `cohere-latest` for branch pushes, `kata-${kata_ref}-gc-${gc_ref}` for workflow_dispatch, or the literal tag for `kata-uvm-v*` tag pushes. 
Companion host-side install script lives at fortress/scratch/oci-b200/k8s/08-install-uvm.sh: it pulls this artifact, verifies sha256 against measurements.json, and rewrites kernel_verity_params in the kata config from the manifest. No host veritysetup needed. NOTE: this commit also temporarily adds `alhassankhedr/build-kata-uvm-cohere` to `on.push.branches` so we can validate end-to-end on the PR branch before merge. That entry must be removed before this lands on cohere. --- .github/workflows/build-kata-uvm-cohere.yaml | 382 +++++++++++++++++++ 1 file changed, 382 insertions(+) create mode 100644 .github/workflows/build-kata-uvm-cohere.yaml diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml new file mode 100644 index 0000000000..3f7008ade2 --- /dev/null +++ b/.github/workflows/build-kata-uvm-cohere.yaml @@ -0,0 +1,382 @@ +name: Build Kata UVM Image (Cohere NVIDIA GPU Confidential) + +# Build the Kata Containers NVIDIA-GPU-confidential UVM image with our +# attestation-agent + api-server-rest baked in *from source*, instead of +# post-hoc patching the stock NVIDIA image (which is what +# fortress/scratch/oci-b200/k8s/06-patch-uvm.sh does). +# +# How: +# 1. Check out kata-containers @ ${kata_ref}. +# 2. Rewrite versions.yaml: point externals.coco-guest-components.url / +# .version at our cohere-ai/guest-components fork. The kata build +# driver clones that and statically builds AA + api-server-rest + +# CDH. nvidia_rootfs.sh's coco_guest_components() step then copies +# those binaries into the final UVM rootfs at /usr/local/bin/. +# 3. Run `make rootfs-image-nvidia-gpu-confidential-tarball` (which also +# builds agent, busybox, pause-image, coco-guest-components, and +# kernel-nvidia-gpu under the hood — every dep is containerised by +# kata-deploy-binaries-in-docker.sh, so the runner just needs Docker). +# 4. Extract the .image + root_hash file from the tarball. +# 5. 
Push to GHCR as an OCI artifact with the dm-verity params surfaced +# as annotations so the host install script can wire kata config +# without re-running `veritysetup format`. +# +# Output OCI ref: +# ghcr.io/${{ github.repository }}/kata-uvm-nvidia-gpu-confidential: +# +# Companion install script (consumes this artifact on a B200 host): +# fortress/scratch/oci-b200/k8s/08-install-uvm.sh + +on: + push: + tags: ["kata-uvm-v*"] + branches: + - "cohere" + # TEMPORARY: enable end-to-end validation of the workflow on the + # feature branch before merge. Remove this entry as part of the + # final review; only `cohere` should remain. + - "alhassankhedr/build-kata-uvm-cohere" + paths: + - ".github/workflows/build-kata-uvm-cohere.yaml" + workflow_dispatch: + inputs: + kata_ref: + description: "kata-containers ref to build from (tag, branch, or SHA)" + required: false + type: string + default: "3.30.0" + kata_repo: + description: "kata-containers repo URL" + required: false + type: string + default: "https://github.com/kata-containers/kata-containers.git" + gc_repo: + description: "guest-components repo URL" + required: false + type: string + default: "https://github.com/cohere-ai/guest-components.git" + gc_ref: + description: "guest-components ref (branch, tag, or SHA)" + required: false + type: string + default: "cohere" + nvidia_gpu_stack: + description: "NVIDIA GPU stack components (driver= is added from versions.yaml)" + required: false + type: string + default: "compute,dcgm,nvswitch" + tag_suffix: + description: "Optional suffix appended to the OCI tag (e.g. 
for ad-hoc test builds)" + required: false + type: string + default: "" + +permissions: + id-token: write + attestations: write + contents: read + packages: write + +env: + OCI_IMAGE: ghcr.io/${{ github.repository }}/kata-uvm-nvidia-gpu-confidential + +jobs: + meta: + name: Compute metadata + runs-on: ubuntu-latest + outputs: + tag: ${{ steps.compute.outputs.tag }} + kata_ref: ${{ steps.compute.outputs.kata_ref }} + gc_repo: ${{ steps.compute.outputs.gc_repo }} + gc_ref: ${{ steps.compute.outputs.gc_ref }} + nvidia_gpu_stack: ${{ steps.compute.outputs.nvidia_gpu_stack }} + steps: + - name: Compute tag and inputs + id: compute + env: + KATA_REF: ${{ inputs.kata_ref || '3.30.0' }} + GC_REPO: ${{ inputs.gc_repo || 'https://github.com/cohere-ai/guest-components.git' }} + GC_REF: ${{ inputs.gc_ref || 'cohere' }} + STACK: ${{ inputs.nvidia_gpu_stack || 'compute,dcgm,nvswitch' }} + SUFFIX: ${{ inputs.tag_suffix || '' }} + run: | + # Tag pattern: + # kata-uvm-v* push -> use the tag literal (after stripping `kata-uvm-`) + # workflow_dispatch -> kata-${KATA_REF}-gc-${GC_REF_SHORT}[suffix] + # branch push -> cohere-latest + if [[ "$GITHUB_REF" == refs/tags/kata-uvm-v* ]]; then + TAG="${GITHUB_REF#refs/tags/kata-uvm-}" + elif [[ "${GITHUB_EVENT_NAME}" == "workflow_dispatch" ]]; then + GC_SHORT="${GC_REF//\//-}" + GC_SHORT="${GC_SHORT:0:12}" + TAG="kata-${KATA_REF//\//-}-gc-${GC_SHORT}" + else + TAG="cohere-latest" + fi + [ -n "$SUFFIX" ] && TAG="${TAG}-${SUFFIX}" + # OCI tags can't contain '+'; replace it with '-'. NOTE: tag length (max 128 chars) is not enforced here. + TAG="${TAG//+/-}" + { + echo "tag=$TAG" + echo "kata_ref=$KATA_REF" + echo "gc_repo=$GC_REPO" + echo "gc_ref=$GC_REF" + echo "nvidia_gpu_stack=$STACK" + } >> "$GITHUB_OUTPUT" + + build: + name: Build kata UVM (nvidia-gpu-confidential) + needs: meta + runs-on: ubuntu-latest + timeout-minutes: 180 + steps: + - name: Free up runner disk space + # The kata build pulls a CUDA repo + NVIDIA drivers into a chroot + # and a kernel build alongside. 
Default ubuntu-latest leaves ~14G; + # we need ~40G or the rootfs build runs out of disk space. + run: | + set -eux + df -h / + sudo rm -rf /usr/local/lib/android /usr/share/dotnet /opt/ghc \ + /usr/local/share/boost /opt/hostedtoolcache/CodeQL \ + /usr/local/share/powershell /usr/local/share/chromium + sudo apt-get purge -y google-cloud-cli azure-cli microsoft-edge-stable \ + dotnet-* aspnetcore-* mongodb-* mysql-* 2>/dev/null || true + sudo apt-get autoremove -y + sudo apt-get clean + docker system prune -af --volumes 2>/dev/null || true + df -h / + + - name: Install host build dependencies + run: | + sudo apt-get update -qq + sudo apt-get install -y --no-install-recommends \ + git make curl ca-certificates jq python3 python3-pip + # Ensure yq is present (kata's build scripts rely on it). + if ! command -v yq >/dev/null 2>&1; then + sudo curl -fsSL -o /usr/local/bin/yq \ + https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 + sudo chmod +x /usr/local/bin/yq + fi + yq --version + + - name: Install ORAS + # Pinned to a fixed, known-good ORAS release (ORAS_VERSION below). The + # version is hard-coded in this step, not read from CAA's versions.yaml; + # bump it here by hand when upgrading ORAS. 
+ run: | + ORAS_VERSION=1.2.0 + curl -fsSLO "https://github.com/oras-project/oras/releases/download/v${ORAS_VERSION}/oras_${ORAS_VERSION}_linux_amd64.tar.gz" + tar -xzf "oras_${ORAS_VERSION}_linux_amd64.tar.gz" oras + sudo mv oras /usr/local/bin/ + rm -f "oras_${ORAS_VERSION}_linux_amd64.tar.gz" + oras version + + - name: Checkout kata-containers @ ${{ needs.meta.outputs.kata_ref }} + run: | + set -eux + git clone --depth 1 --branch "${{ needs.meta.outputs.kata_ref }}" \ + "${{ inputs.kata_repo || 'https://github.com/kata-containers/kata-containers.git' }}" \ + /tmp/kata + ( cd /tmp/kata && git rev-parse HEAD ) + + - name: Override coco-guest-components in versions.yaml + # This is the key step: tell kata's coco-guest-components builder + # to clone our cohere-ai fork at our chosen ref. Everything + # downstream (rootfs assembly, dm-verity, root_hash) is unchanged + # and uses these binaries as if they had come from upstream. + env: + GC_REPO: ${{ needs.meta.outputs.gc_repo }} + GC_REF: ${{ needs.meta.outputs.gc_ref }} + run: | + set -eux + cd /tmp/kata + # Resolve gc_ref to a SHA so the build is reproducible. We do + # this with `git ls-remote` rather than cloning the whole tree. + GC_SHA=$(git ls-remote "${GC_REPO}" "${GC_REF}" | awk '{print $1}' | head -n1) + if [[ -z "$GC_SHA" ]]; then + # Maybe gc_ref already IS a SHA; let downstream fail loudly if not. 
+ GC_SHA="${GC_REF}" + fi + echo "Resolved guest-components ref ${GC_REF} -> ${GC_SHA}" + + yq -i \ + ".externals.\"coco-guest-components\".url = \"${GC_REPO}\" | + .externals.\"coco-guest-components\".version = \"${GC_SHA}\"" \ + versions.yaml + + echo "----- updated versions.yaml (coco-guest-components) -----" + yq '.externals."coco-guest-components"' versions.yaml + + - name: Build rootfs-image-nvidia-gpu-confidential + env: + NVIDIA_GPU_STACK: ${{ needs.meta.outputs.nvidia_gpu_stack }} + run: | + set -eux + cd /tmp/kata/tools/packaging/kata-deploy/local-build + # `make -tarball` chains all the Docker-isolated builds + # (agent, busybox, pause-image, coco-guest-components, + # kernel-nvidia-gpu) before running the rootfs assembly. Each + # sub-build runs in its own ephemeral container, so we don't + # need to install rust/go/etc on the host. + NVIDIA_GPU_STACK="$NVIDIA_GPU_STACK" \ + make rootfs-image-nvidia-gpu-confidential-tarball + + ls -lh build/ + + - name: Extract .image and root_hash from the tarball + run: | + set -eux + cd /tmp/kata/tools/packaging/kata-deploy/local-build/build + TARBALL=kata-static-rootfs-image-nvidia-gpu-confidential.tar.zst + [[ -f "$TARBALL" ]] || { echo "FATAL: $TARBALL missing"; exit 1; } + + mkdir -p /tmp/uvm-out + # Tarball layout: + # ./opt/kata/share/kata-containers/kata-containers-nvidia-gpu-confidential.img + # ./opt/kata/share/kata-containers/root_hash_nvidia-gpu-confidential.txt + tar --zstd -xvf "$TARBALL" -C /tmp/uvm-out + mv /tmp/uvm-out/opt/kata/share/kata-containers/kata-containers-nvidia-gpu-confidential.img \ + /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img + mv /tmp/uvm-out/opt/kata/share/kata-containers/root_hash_nvidia-gpu-confidential.txt \ + /tmp/uvm-out/root_hash.txt + rm -rf /tmp/uvm-out/opt + ls -lh /tmp/uvm-out/ + echo "----- root_hash.txt -----" + cat /tmp/uvm-out/root_hash.txt + + - name: Surface verity params as JSON metadata + id: measure + # The root_hash.txt file is the source of truth for 
kata's + # `kernel_verity_params` (root_hash, salt, data_blocks, etc). + # We re-emit those values as a flat JSON file so the host install + # script can parse them without invoking veritysetup. + run: | + set -eux + ROOT_HASH=$(awk -F'=' '/^root_hash=/ {print $2}' /tmp/uvm-out/root_hash.txt) + SALT=$(awk -F'=' '/^salt=/ {print $2}' /tmp/uvm-out/root_hash.txt) + DATA_BLOCKS=$(awk -F'=' '/^data_blocks=/ {print $2}' /tmp/uvm-out/root_hash.txt) + DATA_BLOCK_SIZE=$(awk -F'=' '/^data_block_size=/ {print $2}' /tmp/uvm-out/root_hash.txt) + HASH_BLOCK_SIZE=$(awk -F'=' '/^hash_block_size=/ {print $2}' /tmp/uvm-out/root_hash.txt) + IMG_SHA256=$(sha256sum /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img | awk '{print $1}') + IMG_BYTES=$(stat -c %s /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img) + + jq -n \ + --arg kata_ref "${{ needs.meta.outputs.kata_ref }}" \ + --arg gc_repo "${{ needs.meta.outputs.gc_repo }}" \ + --arg gc_ref "${{ needs.meta.outputs.gc_ref }}" \ + --arg nvidia_stack "${{ needs.meta.outputs.nvidia_gpu_stack }}" \ + --arg root_hash "$ROOT_HASH" \ + --arg salt "$SALT" \ + --arg data_blocks "$DATA_BLOCKS" \ + --arg data_block_sz "$DATA_BLOCK_SIZE" \ + --arg hash_block_sz "$HASH_BLOCK_SIZE" \ + --arg img_sha256 "$IMG_SHA256" \ + --arg img_bytes "$IMG_BYTES" \ + --arg caa_commit "$GITHUB_SHA" \ + --arg build_date "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + '{ + kata_ref: $kata_ref, + guest_components: {repo: $gc_repo, ref: $gc_ref}, + nvidia_gpu_stack: $nvidia_stack, + dm_verity: { + root_hash: $root_hash, + salt: $salt, + data_blocks: ($data_blocks | tonumber), + data_block_size: ($data_block_sz | tonumber), + hash_block_size: ($hash_block_sz | tonumber) + }, + image: { + filename: "kata-containers-nvidia-gpu-confidential.img", + sha256: $img_sha256, + bytes: ($img_bytes | tonumber) + }, + source: {caa_commit: $caa_commit, build_date: $build_date} + }' > /tmp/uvm-out/measurements.json + + cat /tmp/uvm-out/measurements.json + echo 
"root_hash=$ROOT_HASH" >> "$GITHUB_OUTPUT" + echo "img_sha256=$IMG_SHA256" >> "$GITHUB_OUTPUT" + + - name: Compress .image for transport + run: | + set -eux + cd /tmp/uvm-out + # The raw .image is ~250 MiB; zstd brings it under 100 MiB which + # makes oras push fast on cold registries. + zstd -19 --long -T0 --rm kata-containers-nvidia-gpu-confidential.img \ + -o kata-containers-nvidia-gpu-confidential.img.zst + ls -lh + + - name: Login to GHCR + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Push artifact to GHCR + id: push + env: + OCI_TAG: ${{ needs.meta.outputs.tag }} + ROOT_HASH: ${{ steps.measure.outputs.root_hash }} + IMG_SHA256: ${{ steps.measure.outputs.img_sha256 }} + run: | + set -eux + OCI_REF="${OCI_IMAGE}:${OCI_TAG}" + cd /tmp/uvm-out + oras push "$OCI_REF" \ + kata-containers-nvidia-gpu-confidential.img.zst:application/vnd.cohere.kata-uvm.image+zstd \ + root_hash.txt:application/vnd.cohere.kata-uvm.verity+plain \ + measurements.json:application/vnd.cohere.kata-uvm.measurements+json \ + --annotation "org.opencontainers.image.title=kata-uvm-nvidia-gpu-confidential" \ + --annotation "org.opencontainers.image.description=Kata Containers NVIDIA GPU confidential UVM image, built from source with cohere-ai/guest-components" \ + --annotation "org.opencontainers.image.source=https://github.com/${GITHUB_REPOSITORY}" \ + --annotation "org.opencontainers.image.revision=${GITHUB_SHA}" \ + --annotation "org.opencontainers.image.created=$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ + --annotation "com.cohere.caa.commit=${GITHUB_SHA}" \ + --annotation "com.cohere.kata.ref=${{ needs.meta.outputs.kata_ref }}" \ + --annotation "com.cohere.guest-components.repo=${{ needs.meta.outputs.gc_repo }}" \ + --annotation "com.cohere.guest-components.ref=${{ needs.meta.outputs.gc_ref }}" \ + --annotation "com.cohere.kata-uvm.image-sha256=${IMG_SHA256}" \ + 
--annotation "com.cohere.kata-uvm.root-hash=${ROOT_HASH}" \ + --format json > oras-output.json + + cat oras-output.json + DIGEST=$(jq -r '.digest' oras-output.json) + { + echo "digest=$DIGEST" + echo "oci_ref=${OCI_REF}@${DIGEST}" + echo "oci_tag=$OCI_TAG" + } >> "$GITHUB_OUTPUT" + echo "Pushed: $OCI_REF @ $DIGEST" + + - name: Attest build provenance + uses: actions/attest-build-provenance@a2bbfa25375fe432b6a289bc6b6cd05ecd0c4c32 # v4 + with: + subject-name: ${{ env.OCI_IMAGE }} + subject-digest: ${{ steps.push.outputs.digest }} + push-to-registry: true + + - name: Job summary + run: | + { + echo "### Kata UVM image built" + echo "" + echo "| Field | Value |" + echo "| --- | --- |" + echo "| OCI ref | \`${OCI_IMAGE}:${{ needs.meta.outputs.tag }}\` |" + echo "| Digest | \`${{ steps.push.outputs.digest }}\` |" + echo "| kata-containers ref | \`${{ needs.meta.outputs.kata_ref }}\` |" + echo "| guest-components | \`${{ needs.meta.outputs.gc_repo }}@${{ needs.meta.outputs.gc_ref }}\` |" + echo "| NVIDIA stack | \`${{ needs.meta.outputs.nvidia_gpu_stack }}\` |" + echo "| root_hash | \`${{ steps.measure.outputs.root_hash }}\` |" + echo "| image sha256 | \`${{ steps.measure.outputs.img_sha256 }}\` |" + echo "" + echo "Install on a B200 host with:" + echo "" + echo '```bash' + echo "ORAS_REF=${OCI_IMAGE}:${{ needs.meta.outputs.tag }} \\" + echo " bash fortress/scratch/oci-b200/k8s/08-install-uvm.sh" + echo '```' + } >> "$GITHUB_STEP_SUMMARY" From d309d64c5d72bd2ef68814f6f20dbbab2190f4ac Mon Sep 17 00:00:00 2001 From: Alhassan Khedr Date: Thu, 14 May 2026 14:50:58 -0400 Subject: [PATCH 2/8] ci: update install-script references to fortress k8s/05-install-uvm.sh Companion to fortress's k8s/ script reordering. The CI workflow's header comments and the GHCR step summary now point at the new numbering (05-install-uvm.sh) and reference the legacy patch path (08-patch-uvm.sh) by its new number too. 
--- .github/workflows/build-kata-uvm-cohere.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml index 3f7008ade2..ae2f857efb 100644 --- a/.github/workflows/build-kata-uvm-cohere.yaml +++ b/.github/workflows/build-kata-uvm-cohere.yaml @@ -2,8 +2,8 @@ name: Build Kata UVM Image (Cohere NVIDIA GPU Confidential) # Build the Kata Containers NVIDIA-GPU-confidential UVM image with our # attestation-agent + api-server-rest baked in *from source*, instead of -# post-hoc patching the stock NVIDIA image (which is what -# fortress/scratch/oci-b200/k8s/06-patch-uvm.sh does). +# post-hoc patching the stock NVIDIA image (which is what the legacy +# fortress/scratch/oci-b200/k8s/08-patch-uvm.sh does). # # How: # 1. Check out kata-containers @ ${kata_ref}. @@ -25,7 +25,7 @@ name: Build Kata UVM Image (Cohere NVIDIA GPU Confidential) # ghcr.io/${{ github.repository }}/kata-uvm-nvidia-gpu-confidential: # # Companion install script (consumes this artifact on a B200 host): -# fortress/scratch/oci-b200/k8s/08-install-uvm.sh +# fortress/scratch/oci-b200/k8s/05-install-uvm.sh on: push: @@ -377,6 +377,6 @@ jobs: echo "" echo '```bash' echo "ORAS_REF=${OCI_IMAGE}:${{ needs.meta.outputs.tag }} \\" - echo " bash fortress/scratch/oci-b200/k8s/08-install-uvm.sh" + echo " bash fortress/scratch/oci-b200/k8s/05-install-uvm.sh" echo '```' } >> "$GITHUB_STEP_SUMMARY" From fccd4d7b48966e3a55b0bf7e4a4c18486d54d644 Mon Sep 17 00:00:00 2001 From: Alhassan Khedr Date: Thu, 14 May 2026 15:22:57 -0400 Subject: [PATCH 3/8] fix(ci): auto-pin driver= in NVIDIA_GPU_STACK from kata versions.yaml kata 3.30+ nvidia_chroot.sh runs with set -u and only assigns driver_version when NVIDIA_GPU_STACK contains a literal `driver=` component. 
Without it the rootfs-assembly stage dies at the very last step with `driver_version: unbound variable`, after the runner has already done ~45 minutes of work (agent, busybox, pause-image, coco-guest-components, kernel-nvidia-gpu). This is exactly how run 25877534335 failed. Fix: derive the driver pin from .externals.nvidia.driver.version in kata's own versions.yaml and prepend driver=VERSION to NVIDIA_GPU_STACK in the build step. Auto-tracks kata_ref. --- .github/workflows/build-kata-uvm-cohere.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml index ae2f857efb..0bdb91d910 100644 --- a/.github/workflows/build-kata-uvm-cohere.yaml +++ b/.github/workflows/build-kata-uvm-cohere.yaml @@ -214,6 +214,22 @@ jobs: run: | set -eux cd /tmp/kata/tools/packaging/kata-deploy/local-build + # kata 3.30+ nvidia_chroot.sh runs with `set -u` and only assigns + # driver_version when NVIDIA_GPU_STACK contains a literal + # `driver=` (tools/osbuilder/rootfs-builder/nvidia/ + # nvidia_chroot.sh::install_userspace_components). Without it, + # the rootfs-assembly stage dies at the very end with + # `driver_version: unbound variable`. Derive the canonical pin + # from kata's own versions.yaml so this auto-tracks KATA_REF. + if [[ ",${NVIDIA_GPU_STACK}," != *",driver="* ]]; then + DRIVER_VER=$(yq '.externals.nvidia.driver.version' /tmp/kata/versions.yaml 2>/dev/null | tr -d '"') + if [[ -n "$DRIVER_VER" && "$DRIVER_VER" != "null" ]]; then + NVIDIA_GPU_STACK="driver=${DRIVER_VER},${NVIDIA_GPU_STACK}" + echo "Prepended driver=${DRIVER_VER} (from versions.yaml) -> ${NVIDIA_GPU_STACK}" + else + echo "WARN: could not resolve .externals.nvidia.driver.version from /tmp/kata/versions.yaml" >&2 + fi + fi # `make -tarball` chains all the Docker-isolated builds # (agent, busybox, pause-image, coco-guest-components, # kernel-nvidia-gpu) before running the rootfs assembly. 
Each From 55ea6da22b6682c49f0d7969ca2a9c5473eeceae Mon Sep 17 00:00:00 2001 From: Alhassan Khedr Date: Thu, 14 May 2026 23:08:44 -0400 Subject: [PATCH 4/8] fix(ci): correctly parse root_hash.txt and dereference image symlink Two bugs in the "Extract" / "Surface verity params" steps that together caused the workflow to abort with `jq: error ... Expected JSON value (while parsing '')` (exit 5) and would also have produced a junk artifact even if jq had not failed: 1. root_hash.txt is a SINGLE comma-separated line written by kata's osbuilder, not five newline-separated key=value lines. The previous `awk -F'=' '/^salt=/ {print $2}'` parsers therefore returned empty strings for everything except root_hash (and even that came out with a trailing ",salt"), which crashed jq's `tonumber` on data_blocks. Replace with a single comma-split + case dispatch, plus regex sanity checks so a future format change fails loudly. 2. The .img inside the tarball is a symlink to the versioned .image alongside it. The previous `mv` only relocated the symlink, then `rm -rf opt/` deleted the underlying file. Resolve via `readlink -f` and `cp` the real file before tearing the directory down. Add a minimum-size assertion (>100 MiB) so a dangling symlink is caught immediately rather than producing measurements.json with bytes=57. Also tightens the shell with `set -euxo pipefail` and a `jq -e .` validation of the produced measurements.json. 
--- .github/workflows/build-kata-uvm-cohere.yaml | 58 ++++++++++++++++---- 1 file changed, 48 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml index 0bdb91d910..1da3a8329d 100644 --- a/.github/workflows/build-kata-uvm-cohere.yaml +++ b/.github/workflows/build-kata-uvm-cohere.yaml @@ -242,24 +242,34 @@ jobs: - name: Extract .image and root_hash from the tarball run: | - set -eux + set -euxo pipefail cd /tmp/kata/tools/packaging/kata-deploy/local-build/build TARBALL=kata-static-rootfs-image-nvidia-gpu-confidential.tar.zst [[ -f "$TARBALL" ]] || { echo "FATAL: $TARBALL missing"; exit 1; } mkdir -p /tmp/uvm-out - # Tarball layout: + # Tarball layout (the .img entry is a symlink to the + # versioned .image alongside it): # ./opt/kata/share/kata-containers/kata-containers-nvidia-gpu-confidential.img + # -> kata-ubuntu-noble-nvidia-gpu-confidential-.image + # ./opt/kata/share/kata-containers/kata-ubuntu-noble-nvidia-gpu-confidential-.image # ./opt/kata/share/kata-containers/root_hash_nvidia-gpu-confidential.txt tar --zstd -xvf "$TARBALL" -C /tmp/uvm-out - mv /tmp/uvm-out/opt/kata/share/kata-containers/kata-containers-nvidia-gpu-confidential.img \ - /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img + + # Resolve the symlink to the real image and copy it (not move), + # so the underlying file survives the `rm -rf opt/` below. 
+ KATA_IMG_LINK=/tmp/uvm-out/opt/kata/share/kata-containers/kata-containers-nvidia-gpu-confidential.img + KATA_IMG_REAL=$(readlink -f "$KATA_IMG_LINK") + [[ -f "$KATA_IMG_REAL" ]] || { echo "FATAL: $KATA_IMG_LINK -> $KATA_IMG_REAL missing"; exit 1; } + cp --reflink=auto "$KATA_IMG_REAL" /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img + mv /tmp/uvm-out/opt/kata/share/kata-containers/root_hash_nvidia-gpu-confidential.txt \ /tmp/uvm-out/root_hash.txt rm -rf /tmp/uvm-out/opt ls -lh /tmp/uvm-out/ echo "----- root_hash.txt -----" cat /tmp/uvm-out/root_hash.txt + file /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img - name: Surface verity params as JSON metadata id: measure @@ -267,15 +277,42 @@ jobs: # `kernel_verity_params` (root_hash, salt, data_blocks, etc). # We re-emit those values as a flat JSON file so the host install # script can parse them without invoking veritysetup. + # + # NOTE: kata's osbuilder writes root_hash.txt as a single + # comma-separated line, e.g. + # root_hash=,salt=,data_blocks=N,data_block_size=4096,hash_block_size=4096 + # so we split on commas first, then on '=' to populate each var. 
run: | - set -eux - ROOT_HASH=$(awk -F'=' '/^root_hash=/ {print $2}' /tmp/uvm-out/root_hash.txt) - SALT=$(awk -F'=' '/^salt=/ {print $2}' /tmp/uvm-out/root_hash.txt) - DATA_BLOCKS=$(awk -F'=' '/^data_blocks=/ {print $2}' /tmp/uvm-out/root_hash.txt) - DATA_BLOCK_SIZE=$(awk -F'=' '/^data_block_size=/ {print $2}' /tmp/uvm-out/root_hash.txt) - HASH_BLOCK_SIZE=$(awk -F'=' '/^hash_block_size=/ {print $2}' /tmp/uvm-out/root_hash.txt) + set -euxo pipefail + + ROOT_HASH=""; SALT=""; DATA_BLOCKS="" + DATA_BLOCK_SIZE=""; HASH_BLOCK_SIZE="" + while IFS='=' read -r k v; do + case "$k" in + root_hash) ROOT_HASH=$v ;; + salt) SALT=$v ;; + data_blocks) DATA_BLOCKS=$v ;; + data_block_size) DATA_BLOCK_SIZE=$v ;; + hash_block_size) HASH_BLOCK_SIZE=$v ;; + esac + done < <(tr ',' '\n' < /tmp/uvm-out/root_hash.txt) + + # Fail loudly on parse regressions instead of producing a junk + # measurements.json with empty fields. + : "${ROOT_HASH:?root_hash missing from root_hash.txt}" + : "${SALT:?salt missing from root_hash.txt}" + : "${DATA_BLOCKS:?data_blocks missing from root_hash.txt}" + : "${DATA_BLOCK_SIZE:?data_block_size missing from root_hash.txt}" + : "${HASH_BLOCK_SIZE:?hash_block_size missing from root_hash.txt}" + [[ "$ROOT_HASH" =~ ^[0-9a-f]{64}$ ]] || { echo "bad root_hash: $ROOT_HASH"; exit 1; } + [[ "$SALT" =~ ^[0-9a-f]{64}$ ]] || { echo "bad salt: $SALT"; exit 1; } + IMG_SHA256=$(sha256sum /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img | awk '{print $1}') IMG_BYTES=$(stat -c %s /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img) + [[ "$IMG_SHA256" =~ ^[0-9a-f]{64}$ ]] || { echo "bad image sha256: $IMG_SHA256"; exit 1; } + # GPU UVM is always hundreds of MB; anything tiny means we measured + # a dangling symlink or an empty file. 
+ [[ "$IMG_BYTES" -gt 104857600 ]] || { echo "image suspiciously small: $IMG_BYTES bytes"; exit 1; } jq -n \ --arg kata_ref "${{ needs.meta.outputs.kata_ref }}" \ @@ -310,6 +347,7 @@ jobs: source: {caa_commit: $caa_commit, build_date: $build_date} }' > /tmp/uvm-out/measurements.json + jq -e . /tmp/uvm-out/measurements.json >/dev/null cat /tmp/uvm-out/measurements.json echo "root_hash=$ROOT_HASH" >> "$GITHUB_OUTPUT" echo "img_sha256=$IMG_SHA256" >> "$GITHUB_OUTPUT" From b9c89df4f0e1e4d219f91989714f0a9e968aa740 Mon Sep 17 00:00:00 2001 From: Alhassan Khedr Date: Thu, 14 May 2026 23:14:23 -0400 Subject: [PATCH 5/8] ci: add kata_nvidia_driver_ver input to override the pinned NVIDIA driver Kata 3.30.0 pins driver=595.58.03 in versions.yaml, but on 8x B200 OCI hosts that driver hits a fabric-probe race where RmGpuFabricProbe times out and fail-stops GPU init. The fix landed in 595.71.05 (which is also the version present in the working mkosi-built images). This adds an optional workflow_dispatch input `kata_nvidia_driver_ver`. When set (e.g. to 595.71.05), the build: - Rewrites .externals.nvidia.driver.version in kata's versions.yaml before the rootfs build, so the pin flows through to both open-gpu-kernel-modules (cloned from the GitHub tag) and the nvidia-driver-pinning- apt package. - Surfaces the override in the OCI tag (kata-...-drv-), the com.cohere.kata-uvm.nvidia-driver annotation, measurements.json's new nvidia_driver.version field, and the job summary. When unset, the build behaves exactly as before. measurements.json always reflects the *actually baked-in* driver (read from the post-rewrite versions.yaml) rather than the requested input, so it stays truthful when the override is empty. Mirrors the same mechanic in fortress/scratch/oci-b200/k8s/04-build-uvm-locally.sh. 
--- .github/workflows/build-kata-uvm-cohere.yaml | 63 ++++++++++++++++++-- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml index 1da3a8329d..af8fd36aa8 100644 --- a/.github/workflows/build-kata-uvm-cohere.yaml +++ b/.github/workflows/build-kata-uvm-cohere.yaml @@ -60,6 +60,17 @@ on: required: false type: string default: "cohere" + kata_nvidia_driver_ver: + description: | + Override .externals.nvidia.driver.version in kata's versions.yaml + (e.g. 595.71.05). Leave empty to use kata's default. The tag must + exist at https://github.com/NVIDIA/open-gpu-kernel-modules and a + matching nvidia-driver-pinning- package must exist in the + NVIDIA CUDA apt repo. Required to fix the 8x B200 fabric-probe + race in kata 3.30.0's default 595.58.03 driver pin. + required: false + type: string + default: "" nvidia_gpu_stack: description: "NVIDIA GPU stack components (driver= is added from versions.yaml)" required: false @@ -89,20 +100,22 @@ jobs: kata_ref: ${{ steps.compute.outputs.kata_ref }} gc_repo: ${{ steps.compute.outputs.gc_repo }} gc_ref: ${{ steps.compute.outputs.gc_ref }} + kata_nvidia_driver_ver: ${{ steps.compute.outputs.kata_nvidia_driver_ver }} nvidia_gpu_stack: ${{ steps.compute.outputs.nvidia_gpu_stack }} steps: - name: Compute tag and inputs id: compute env: - KATA_REF: ${{ inputs.kata_ref || '3.30.0' }} - GC_REPO: ${{ inputs.gc_repo || 'https://github.com/cohere-ai/guest-components.git' }} - GC_REF: ${{ inputs.gc_ref || 'cohere' }} - STACK: ${{ inputs.nvidia_gpu_stack || 'compute,dcgm,nvswitch' }} - SUFFIX: ${{ inputs.tag_suffix || '' }} + KATA_REF: ${{ inputs.kata_ref || '3.30.0' }} + GC_REPO: ${{ inputs.gc_repo || 'https://github.com/cohere-ai/guest-components.git' }} + GC_REF: ${{ inputs.gc_ref || 'cohere' }} + DRIVER_VER: ${{ inputs.kata_nvidia_driver_ver || '' }} + STACK: ${{ inputs.nvidia_gpu_stack || 'compute,dcgm,nvswitch' }} + SUFFIX: ${{ 
inputs.tag_suffix || '' }} run: | # Tag pattern: # kata-uvm-v* push -> use the tag literal (after stripping `kata-uvm-`) - # workflow_dispatch -> kata-${KATA_REF}-gc-${GC_REF_SHORT}[suffix] + # workflow_dispatch -> kata-${KATA_REF}-gc-${GC_REF_SHORT}[-drv-][suffix] # branch push -> cohere-latest if [[ "$GITHUB_REF" == refs/tags/kata-uvm-v* ]]; then TAG="${GITHUB_REF#refs/tags/kata-uvm-}" @@ -110,6 +123,10 @@ jobs: GC_SHORT="${GC_REF//\//-}" GC_SHORT="${GC_SHORT:0:12}" TAG="kata-${KATA_REF//\//-}-gc-${GC_SHORT}" + # If the caller overrode the NVIDIA driver pin, surface it in the + # OCI tag so the artifact name unambiguously identifies which + # driver is baked in. + [ -n "$DRIVER_VER" ] && TAG="${TAG}-drv-${DRIVER_VER}" else TAG="cohere-latest" fi @@ -121,6 +138,7 @@ jobs: echo "kata_ref=$KATA_REF" echo "gc_repo=$GC_REPO" echo "gc_ref=$GC_REF" + echo "kata_nvidia_driver_ver=$DRIVER_VER" echo "nvidia_gpu_stack=$STACK" } >> "$GITHUB_OUTPUT" @@ -208,6 +226,27 @@ jobs: echo "----- updated versions.yaml (coco-guest-components) -----" yq '.externals."coco-guest-components"' versions.yaml + - name: Override NVIDIA driver pin in versions.yaml + # kata 3.30.0's versions.yaml pins driver=595.58.03, but on 8x B200 + # OCI hosts that driver hits a fabric-probe race (kernel timeout in + # RmGpuFabricProbe -> fail-stop). The fix landed in 595.71.05. The + # build pulls open kernel modules from + # https://github.com/NVIDIA/open-gpu-kernel-modules tags, and the + # userspace via nvidia-driver-pinning- from the NVIDIA CUDA + # apt repo, so any version that exists in both places is a valid + # override. Skipped when input is empty. 
+ if: needs.meta.outputs.kata_nvidia_driver_ver != '' + env: + DRIVER_VER: ${{ needs.meta.outputs.kata_nvidia_driver_ver }} + run: | + set -eux + cd /tmp/kata + OLD_VER=$(yq '.externals.nvidia.driver.version' versions.yaml | tr -d '"') + yq -i '.externals.nvidia.driver.version = strenv(DRIVER_VER)' versions.yaml # strenv(): the pin is passed as data, never spliced into the yq program text + echo "NVIDIA driver pin: ${OLD_VER} -> ${DRIVER_VER}" + echo "----- updated versions.yaml (nvidia.driver) -----" + yq '.externals.nvidia.driver' versions.yaml + - name: Build rootfs-image-nvidia-gpu-confidential env: NVIDIA_GPU_STACK: ${{ needs.meta.outputs.nvidia_gpu_stack }} @@ -314,10 +353,17 @@ jobs: # a dangling symlink or an empty file. [[ "$IMG_BYTES" -gt 104857600 ]] || { echo "image suspiciously small: $IMG_BYTES bytes"; exit 1; } + # Resolve the *actual* baked-in driver pin from versions.yaml so + # the artifact reports what is really installed, not just what + # was requested. (When kata_nvidia_driver_ver is empty we want + # kata's default to be reflected here.) + DRIVER_VER=$(yq '.externals.nvidia.driver.version' /tmp/kata/versions.yaml | tr -d '"') + jq -n \ --arg kata_ref "${{ needs.meta.outputs.kata_ref }}" \ --arg gc_repo "${{ needs.meta.outputs.gc_repo }}" \ --arg gc_ref "${{ needs.meta.outputs.gc_ref }}" \ + --arg driver_ver "$DRIVER_VER" \ --arg nvidia_stack "${{ needs.meta.outputs.nvidia_gpu_stack }}" \ --arg root_hash "$ROOT_HASH" \ --arg salt "$SALT" \ @@ -331,6 +377,7 @@ jobs: '{ kata_ref: $kata_ref, guest_components: {repo: $gc_repo, ref: $gc_ref}, + nvidia_driver: {version: $driver_ver}, nvidia_gpu_stack: $nvidia_stack, dm_verity: { root_hash: $root_hash, @@ -351,6 +398,7 @@ jobs: cat /tmp/uvm-out/measurements.json echo "root_hash=$ROOT_HASH" >> "$GITHUB_OUTPUT" echo "img_sha256=$IMG_SHA256" >> "$GITHUB_OUTPUT" + echo "driver_ver=$DRIVER_VER" >> "$GITHUB_OUTPUT" - name: Compress .image for transport run: | @@ -375,6 +423,7 @@ jobs: OCI_TAG: ${{ needs.meta.outputs.tag }} ROOT_HASH: ${{ steps.measure.outputs.root_hash }} IMG_SHA256:
${{ steps.measure.outputs.img_sha256 }} + DRIVER_VER: ${{ steps.measure.outputs.driver_ver }} run: | set -eux OCI_REF="${OCI_IMAGE}:${OCI_TAG}" @@ -392,6 +441,7 @@ jobs: --annotation "com.cohere.kata.ref=${{ needs.meta.outputs.kata_ref }}" \ --annotation "com.cohere.guest-components.repo=${{ needs.meta.outputs.gc_repo }}" \ --annotation "com.cohere.guest-components.ref=${{ needs.meta.outputs.gc_ref }}" \ + --annotation "com.cohere.kata-uvm.nvidia-driver=${DRIVER_VER}" \ --annotation "com.cohere.kata-uvm.image-sha256=${IMG_SHA256}" \ --annotation "com.cohere.kata-uvm.root-hash=${ROOT_HASH}" \ --format json > oras-output.json @@ -423,6 +473,7 @@ jobs: echo "| Digest | \`${{ steps.push.outputs.digest }}\` |" echo "| kata-containers ref | \`${{ needs.meta.outputs.kata_ref }}\` |" echo "| guest-components | \`${{ needs.meta.outputs.gc_repo }}@${{ needs.meta.outputs.gc_ref }}\` |" + echo "| NVIDIA driver | \`${{ steps.measure.outputs.driver_ver }}\` |" echo "| NVIDIA stack | \`${{ needs.meta.outputs.nvidia_gpu_stack }}\` |" echo "| root_hash | \`${{ steps.measure.outputs.root_hash }}\` |" echo "| image sha256 | \`${{ steps.measure.outputs.img_sha256 }}\` |" From 6b51099ac1e49c3f4c404381ee8863403b112e25 Mon Sep 17 00:00:00 2001 From: Alhassan Khedr Date: Thu, 14 May 2026 23:58:58 -0400 Subject: [PATCH 6/8] ci: default gc_ref to alhassankhedr/sync-main-to-cohere (PR #9) The plain `cohere` branch of guest-components has a `count == 1` guard in `nvidia-attester::detect_platform()` that silently disables the attester on multi-GPU systems. Multi-GPU pods on 8x B200 boot fine but `/aa/additional_evidence` returns empty, which looks like a build issue but is actually the userspace attester refusing to register. Upstream main has a complete rewrite of nvidia-attester on top of the NVAT SDK (no `count == 1` check). PR #9 in cohere-ai/guest-components syncs that rewrite into our fork. 
Until PR #9 merges into `cohere`, default `gc_ref` to `alhassankhedr/sync-main-to-cohere` so kata UVM builds out of this workflow have a working multi-GPU attester. Switch back to `cohere` once PR #9 is merged. --- .github/workflows/build-kata-uvm-cohere.yaml | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml index af8fd36aa8..4e2143c1f3 100644 --- a/.github/workflows/build-kata-uvm-cohere.yaml +++ b/.github/workflows/build-kata-uvm-cohere.yaml @@ -56,10 +56,20 @@ on: type: string default: "https://github.com/cohere-ai/guest-components.git" gc_ref: - description: "guest-components ref (branch, tag, or SHA)" + description: | + guest-components ref (branch, tag, or SHA). + + Default: alhassankhedr/sync-main-to-cohere (head of PR #9). + That branch carries upstream main's nvidia-attester rewrite + (NVAT SDK based, no `count == 1` guard) and is required for + multi-GPU evidence to work end-to-end on 8x B200 hosts. The + plain `cohere` branch still has the old NVML-based attester + which silently produces empty evidence on 2+ GPU systems + (mod a sed `s/count == 1/count >= 1/` patch the podvm-mkosi + Dockerfile applies). Switch back to `cohere` after PR #9 merges. 
required: false type: string - default: "cohere" + default: "alhassankhedr/sync-main-to-cohere" kata_nvidia_driver_ver: description: | Override .externals.nvidia.driver.version in kata's versions.yaml @@ -108,7 +118,7 @@ jobs: env: KATA_REF: ${{ inputs.kata_ref || '3.30.0' }} GC_REPO: ${{ inputs.gc_repo || 'https://github.com/cohere-ai/guest-components.git' }} - GC_REF: ${{ inputs.gc_ref || 'cohere' }} + GC_REF: ${{ inputs.gc_ref || 'alhassankhedr/sync-main-to-cohere' }} DRIVER_VER: ${{ inputs.kata_nvidia_driver_ver || '' }} STACK: ${{ inputs.nvidia_gpu_stack || 'compute,dcgm,nvswitch' }} SUFFIX: ${{ inputs.tag_suffix || '' }} From d5e0166804292fc5da300cf1979e5c6398006c1f Mon Sep 17 00:00:00 2001 From: Alhassan Khedr Date: Fri, 15 May 2026 00:26:24 -0400 Subject: [PATCH 7/8] ci: pin nvidia.nvat.version so attestation-agent-nv actually gets built MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a `kata_nvat_ver` workflow_dispatch input (default 2026.03.02) that rewrites `.externals.nvidia.nvat.{version,url,desc}` in kata's versions.yaml before the rootfs build. Why this matters: kata's tools/packaging/static-build/coco-guest-components/build.sh forwards NVAT_VERSION from versions.yaml to the GC builder Dockerfile. The Dockerfile gates the entire libnvat clone+cmake+install behind `if [ -n "${NVAT_VERSION}" ]`, and upstream kata 3.30.0 ships *without* that key set. Net effect on the cohere fork's UVM: * libnvat is never built into the GC builder image. * build-static-coco-guest-components.sh's second AA build pass — the one that compiles `attestation-agent` with `nvidia-attester` against /usr/local/lib/libnvat.so and installs the result as /usr/local/bin/attestation-agent-nv — silently no-ops because the required system lib is missing. * The rootfs ends up with only the standard, non-NVIDIA AA. Symbol fingerprint of the installed UVM confirms it: zero `nvmlDeviceGetCount`, zero `nv_attestation_sdk`, zero `libnvat`. 
* `/aa/additional_evidence` returns empty on multi-GPU pods regardless of which guest-components branch we baked. ITA appraisal can never see `nvgpu_overall: true`. Pins 2026.03.02 to match the version the podvm-mkosi side already builds against (NVAT_TAG in cloud-api-adaptor's Dockerfile.podvm_binaries.ubuntu). Tag, measurements.json, and OCI annotations all surface the pin so the binding is inspectable from the registry (`-nvat-` tag suffix, `nvat_sdk.version` field, `com.cohere.kata-uvm.nvat-sdk` annotation). --- .github/workflows/build-kata-uvm-cohere.yaml | 82 ++++++++++++++++++-- 1 file changed, 77 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml index 4e2143c1f3..8099af0be4 100644 --- a/.github/workflows/build-kata-uvm-cohere.yaml +++ b/.github/workflows/build-kata-uvm-cohere.yaml @@ -81,6 +81,24 @@ on: required: false type: string default: "" + kata_nvat_ver: + description: | + Pin .externals.nvidia.nvat.version in kata's versions.yaml. Default + 2026.03.02 (a real tag in https://github.com/NVIDIA/attestation-sdk). + + Without this pin, kata's coco-guest-components builder Dockerfile + skips the libnvat build (its `if [ -n "${NVAT_VERSION}" ]` guard + short-circuits), which makes the second AA build pass — the one + that links the nvidia-attester cargo feature against + /usr/local/lib/libnvat.so and installs the result as + /usr/local/bin/attestation-agent-nv — silently no-op. Net effect: + the AA baked into the rootfs has no GPU evidence support + regardless of gc_ref, and /aa/additional_evidence on multi-GPU + pods returns empty. Set "" to leave nvat unpinned (matches + upstream kata 3.30.0 behaviour). 
+ required: false + type: string + default: "2026.03.02" nvidia_gpu_stack: description: "NVIDIA GPU stack components (driver= is added from versions.yaml)" required: false @@ -111,6 +129,7 @@ jobs: gc_repo: ${{ steps.compute.outputs.gc_repo }} gc_ref: ${{ steps.compute.outputs.gc_ref }} kata_nvidia_driver_ver: ${{ steps.compute.outputs.kata_nvidia_driver_ver }} + kata_nvat_ver: ${{ steps.compute.outputs.kata_nvat_ver }} nvidia_gpu_stack: ${{ steps.compute.outputs.nvidia_gpu_stack }} steps: - name: Compute tag and inputs @@ -120,12 +139,13 @@ jobs: GC_REPO: ${{ inputs.gc_repo || 'https://github.com/cohere-ai/guest-components.git' }} GC_REF: ${{ inputs.gc_ref || 'alhassankhedr/sync-main-to-cohere' }} DRIVER_VER: ${{ inputs.kata_nvidia_driver_ver || '' }} + NVAT_VER: ${{ github.event_name != 'workflow_dispatch' && '2026.03.02' || inputs.kata_nvat_ver }} # an explicit "" on workflow_dispatch must stay empty (nvat unpinned); `inputs.x || default` would coerce it back to the default. Non-dispatch events have no inputs and get the default. STACK: ${{ inputs.nvidia_gpu_stack || 'compute,dcgm,nvswitch' }} SUFFIX: ${{ inputs.tag_suffix || '' }} run: | # Tag pattern: # kata-uvm-v* push -> use the tag literal (after stripping `kata-uvm-`) - # workflow_dispatch -> kata-${KATA_REF}-gc-${GC_REF_SHORT}[-drv-][suffix] + # workflow_dispatch -> kata-${KATA_REF}-gc-${GC_REF_SHORT}[-drv-][-nvat-][suffix] # branch push -> cohere-latest if [[ "$GITHUB_REF" == refs/tags/kata-uvm-v* ]]; then TAG="${GITHUB_REF#refs/tags/kata-uvm-}" @@ -137,6 +157,10 @@ jobs: # OCI tag so the artifact name unambiguously identifies which # driver is baked in. [ -n "$DRIVER_VER" ] && TAG="${TAG}-drv-${DRIVER_VER}" + # Same for the NVAT SDK pin: the artifact ABI changes meaningfully + # between NVAT releases (libnvat soname + GpuEvidenceSource API), + # so make this visible too.
+ [ -n "$NVAT_VER" ] && TAG="${TAG}-nvat-${NVAT_VER}" else TAG="cohere-latest" fi @@ -149,6 +173,7 @@ jobs: echo "gc_repo=$GC_REPO" echo "gc_ref=$GC_REF" echo "kata_nvidia_driver_ver=$DRIVER_VER" + echo "kata_nvat_ver=$NVAT_VER" echo "nvidia_gpu_stack=$STACK" } >> "$GITHUB_OUTPUT" @@ -257,6 +282,40 @@ jobs: echo "----- updated versions.yaml (nvidia.driver) -----" yq '.externals.nvidia.driver' versions.yaml + - name: Pin NVIDIA Attestation SDK (libnvat) in versions.yaml + # kata's tools/packaging/static-build/coco-guest-components/build.sh + # reads `.externals.nvidia.nvat.version` and forwards it to the GC + # builder Dockerfile as NVAT_VERSION. The Dockerfile gates the entire + # libnvat clone+cmake+install behind `if [ -n "${NVAT_VERSION}" ]`, + # so an unset key (the default in upstream kata 3.30.0) means no + # libnvat in the builder image. Without libnvat, + # build-static-coco-guest-components.sh's second build pass — the one + # that compiles AA with `nvidia-attester` against + # /usr/local/lib/libnvat.so and installs the result as + # /usr/local/bin/attestation-agent-nv — silently no-ops, and the + # rootfs ends up with only the no-NVIDIA AA. /aa/additional_evidence + # then returns empty regardless of which guest-components we baked. + # Pin a real attestation-sdk tag here so libnvat actually gets built. 
+ if: needs.meta.outputs.kata_nvat_ver != '' + env: + NVAT_VER: ${{ needs.meta.outputs.kata_nvat_ver }} + run: | + set -eux + cd /tmp/kata + OLD_VER=$(yq '.externals.nvidia.nvat.version // ""' versions.yaml | tr -d '"') + yq -i \ + '.externals.nvidia.nvat.version = strenv(NVAT_VER) | + .externals.nvidia.nvat.url = "https://github.com/NVIDIA/attestation-sdk" | + .externals.nvidia.nvat.desc = "NVIDIA Attestation SDK (libnvat); enables attestation-agent-nv"' \ + versions.yaml # strenv(): the pin is passed as data, never spliced into the yq program text + if [[ -z "$OLD_VER" || "$OLD_VER" == "null" ]]; then + echo "NVAT SDK pin: (unset) -> ${NVAT_VER}" + else + echo "NVAT SDK pin: ${OLD_VER} -> ${NVAT_VER}" + fi + echo "----- updated versions.yaml (nvidia.nvat) -----" + yq '.externals.nvidia.nvat' versions.yaml + - name: Build rootfs-image-nvidia-gpu-confidential env: NVIDIA_GPU_STACK: ${{ needs.meta.outputs.nvidia_gpu_stack }} @@ -363,17 +422,19 @@ jobs: # a dangling symlink or an empty file. [[ "$IMG_BYTES" -gt 104857600 ]] || { echo "image suspiciously small: $IMG_BYTES bytes"; exit 1; } - # Resolve the *actual* baked-in driver pin from versions.yaml so - # the artifact reports what is really installed, not just what - # was requested. (When kata_nvidia_driver_ver is empty we want - # kata's default to be reflected here.) + # Resolve the *actual* baked-in driver / nvat pins from versions.yaml + # so the artifact reports what is really installed, not just what + # was requested. (When the kata_* inputs are empty we want kata's + # default to be reflected here.)
DRIVER_VER=$(yq '.externals.nvidia.driver.version' /tmp/kata/versions.yaml | tr -d '"') + NVAT_VER=$(yq '.externals.nvidia.nvat.version // ""' /tmp/kata/versions.yaml | tr -d '"') jq -n \ --arg kata_ref "${{ needs.meta.outputs.kata_ref }}" \ --arg gc_repo "${{ needs.meta.outputs.gc_repo }}" \ --arg gc_ref "${{ needs.meta.outputs.gc_ref }}" \ --arg driver_ver "$DRIVER_VER" \ + --arg nvat_ver "$NVAT_VER" \ --arg nvidia_stack "${{ needs.meta.outputs.nvidia_gpu_stack }}" \ --arg root_hash "$ROOT_HASH" \ --arg salt "$SALT" \ @@ -388,6 +449,8 @@ jobs: kata_ref: $kata_ref, guest_components: {repo: $gc_repo, ref: $gc_ref}, nvidia_driver: {version: $driver_ver}, + nvat_sdk: ( if $nvat_ver == "" or $nvat_ver == "null" then null + else {version: $nvat_ver} end ), nvidia_gpu_stack: $nvidia_stack, dm_verity: { root_hash: $root_hash, @@ -409,6 +472,7 @@ jobs: echo "root_hash=$ROOT_HASH" >> "$GITHUB_OUTPUT" echo "img_sha256=$IMG_SHA256" >> "$GITHUB_OUTPUT" echo "driver_ver=$DRIVER_VER" >> "$GITHUB_OUTPUT" + echo "nvat_ver=$NVAT_VER" >> "$GITHUB_OUTPUT" - name: Compress .image for transport run: | @@ -434,10 +498,16 @@ jobs: ROOT_HASH: ${{ steps.measure.outputs.root_hash }} IMG_SHA256: ${{ steps.measure.outputs.img_sha256 }} DRIVER_VER: ${{ steps.measure.outputs.driver_ver }} + NVAT_VER: ${{ steps.measure.outputs.nvat_ver }} run: | set -eux OCI_REF="${OCI_IMAGE}:${OCI_TAG}" cd /tmp/uvm-out + # NVAT annotation is conditional: an empty string would push a + # value-less label which is misleading. 
+ NVAT_ANNOTATION=() + [[ -n "$NVAT_VER" ]] && NVAT_ANNOTATION+=(--annotation "com.cohere.kata-uvm.nvat-sdk=${NVAT_VER}") + oras push "$OCI_REF" \ kata-containers-nvidia-gpu-confidential.img.zst:application/vnd.cohere.kata-uvm.image+zstd \ root_hash.txt:application/vnd.cohere.kata-uvm.verity+plain \ @@ -452,6 +522,7 @@ jobs: --annotation "com.cohere.guest-components.repo=${{ needs.meta.outputs.gc_repo }}" \ --annotation "com.cohere.guest-components.ref=${{ needs.meta.outputs.gc_ref }}" \ --annotation "com.cohere.kata-uvm.nvidia-driver=${DRIVER_VER}" \ + "${NVAT_ANNOTATION[@]}" \ --annotation "com.cohere.kata-uvm.image-sha256=${IMG_SHA256}" \ --annotation "com.cohere.kata-uvm.root-hash=${ROOT_HASH}" \ --format json > oras-output.json @@ -484,6 +555,7 @@ jobs: echo "| kata-containers ref | \`${{ needs.meta.outputs.kata_ref }}\` |" echo "| guest-components | \`${{ needs.meta.outputs.gc_repo }}@${{ needs.meta.outputs.gc_ref }}\` |" echo "| NVIDIA driver | \`${{ steps.measure.outputs.driver_ver }}\` |" + echo "| NVAT SDK | \`${{ steps.measure.outputs.nvat_ver || '(unset — attestation-agent-nv NOT built)' }}\` |" echo "| NVIDIA stack | \`${{ needs.meta.outputs.nvidia_gpu_stack }}\` |" echo "| root_hash | \`${{ steps.measure.outputs.root_hash }}\` |" echo "| image sha256 | \`${{ steps.measure.outputs.img_sha256 }}\` |" From a55b3acb2a395945633422b19bd559087662927f Mon Sep 17 00:00:00 2001 From: Alhassan Khedr Date: Fri, 15 May 2026 12:25:39 -0400 Subject: [PATCH 8/8] ci(kata-uvm): ship paired kernel binary alongside rootfs to fix Bug F kata's kernel-nvidia-gpu build emits a fresh random certs/signing_key.pem per invocation; the NVIDIA modules baked into kata-static-kernel-nvidia-gpu-modules.tar.zst (and therefore into the rootfs) are signed against THAT key. If the host launches our UVM against a kernel from a different build (e.g. 
the kata-deploy-bundled one), every NVIDIA .ko is rejected at first modprobe with "Loading of unsigned module is rejected", NVRC panics in src/execute.rs:24:9, the guest powers down, and pods sit in Pending forever. Verified end-to-end on the B200 host on 2026-05-15 (README "Bug F"). The host-side fix lives in fortress's 05-install-uvm.sh, which atomically installs both the rootfs symlink and the kernel binary. For that to work, the OCI artifact has to ship the kernel. Mirror the local build pipeline (04-build-uvm-locally.sh) here: * Force a clean kernel + modules + rootfs rebuild whenever kata_nvidia_driver_ver is overridden, so kata's make can't reuse a cached kernel-nvidia-gpu builddir whose embedded signing key doesn't match the new modules tarball. * After "Build rootfs", stage the locally-built vmlinuz (+ vmlinux, System.map, config) into /tmp/uvm-out alongside the rootfs and write kernel.basename as a single source of truth for the install side. * Add a defensive signing-key sanity check that extracts the SKID from kernel-nvidia-gpu/builddir/.../certs/signing_key.x509 and confirms it appears in the trailing PKCS#7 signature of nvidia.ko. Fails the build if the modules tarball is signed by a different key than the kernel embeds. * Extend measurements.json with .kernel.{filename,sha256} so 05-install-uvm.sh can validate the kernel post-pull. * Push the kernel files (vmlinuz/vmlinux/System.map/config and kernel.basename) into the OCI artifact with media type application/vnd.cohere.kata-uvm.kernel+octet-stream, and surface the kernel-basename + kernel-sha256 as OCI annotations. After this, the UVM artifact is self-contained: pulling and installing it places a kernel and rootfs that share a signing key, so guest modprobe of nvidia.ko / nvidia-uvm.ko / nvidia-modeset.ko / nvidia-drm.ko / nvidia-peermem.ko succeeds and NVRC boots cleanly. 
--- .github/workflows/build-kata-uvm-cohere.yaml | 140 +++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/.github/workflows/build-kata-uvm-cohere.yaml b/.github/workflows/build-kata-uvm-cohere.yaml index 8099af0be4..f7a16ab177 100644 --- a/.github/workflows/build-kata-uvm-cohere.yaml +++ b/.github/workflows/build-kata-uvm-cohere.yaml @@ -316,6 +316,33 @@ jobs: echo "----- updated versions.yaml (nvidia.nvat) -----" yq '.externals.nvidia.nvat' versions.yaml + - name: Force a clean kernel + rootfs rebuild when overriding the driver + # Mirrors KATA_NVIDIA_FORCE_REBUILD in + # fortress/scratch/oci-b200/k8s/04-build-uvm-locally.sh. Each kata + # kernel build generates a fresh random `certs/signing_key.pem`, + # and the NVIDIA modules in + # `kata-static-kernel-nvidia-gpu-modules.tar.zst` are signed + # against THAT key. If kata's make reuses any cached kernel- + # nvidia-gpu/ artifacts while we're trying to bump the driver + # version, we end up with userspace<->kernel ABI skew or, worse, + # NVIDIA `.ko` files signed against a different key than the + # one the kernel binary embeds (README "Bug F"). On a fresh CI + # runner this is a no-op, but if we ever start caching the + # kata checkout between runs (or someone re-runs a job with a + # different driver_ver) the wipe makes the build deterministic. 
+ if: needs.meta.outputs.kata_nvidia_driver_ver != '' + run: | + set -eux + BUILD=/tmp/kata/tools/packaging/kata-deploy/local-build/build + rm -rf "$BUILD/kernel-nvidia-gpu" \ + "$BUILD/kata-static-kernel-nvidia-gpu-modules.tar.zst" \ + "$BUILD/kata-static-kernel-nvidia-gpu.tar.zst" \ + "$BUILD/rootfs-image-nvidia-gpu-confidential" \ + "$BUILD/rootfs-nvidia-gpu-confidential-stage-one" \ + "$BUILD/kata-static-rootfs-image-nvidia-gpu-confidential.tar.zst" \ + 2>/dev/null || true + echo "wiped kernel-nvidia-gpu/, modules tarball, and rootfs build dirs" + - name: Build rootfs-image-nvidia-gpu-confidential env: NVIDIA_GPU_STACK: ${{ needs.meta.outputs.nvidia_gpu_stack }} @@ -379,6 +406,78 @@ jobs: cat /tmp/uvm-out/root_hash.txt file /tmp/uvm-out/kata-containers-nvidia-gpu-confidential.img + - name: Stage paired kernel binary alongside the rootfs + # WHY: kata's kernel-nvidia-gpu build emits a fresh random + # `certs/signing_key.pem` per invocation and uses it to sign both + # the embedded NVIDIA modules and (transitively) the modules + # tarball that nvidia_rootfs.sh extracts into the rootfs. The + # rootfs we produced in the previous step therefore carries + # NVIDIA `.ko` files signed against THIS build's key. If the + # host running the resulting UVM uses a kernel from a different + # build (e.g. the kata-deploy-bundled one), the modules are + # rejected at first modprobe, NVRC panics, the guest powers + # down, and pods sit in Pending. Verified end-to-end on + # 2026-05-15 (README "Bug F"). The fix on the install side + # (fortress/scratch/oci-b200/k8s/05-install-uvm.sh) is to + # atomically install both the kernel and the rootfs from the + # same build. For that to work, the OCI artifact has to ship + # the kernel binary alongside the rootfs. 
+ run: | + set -euxo pipefail + KBUILD_DESTDIR=/tmp/kata/tools/packaging/kata-deploy/local-build/build/kernel-nvidia-gpu/destdir/opt/kata/share/kata-containers + KVER_FILE=$(ls "${KBUILD_DESTDIR}"/vmlinuz-*-nvidia-gpu 2>/dev/null | head -n1 || true) + if [[ -z "$KVER_FILE" ]]; then + echo "FATAL: no locally-built kernel at ${KBUILD_DESTDIR}/vmlinuz-*-nvidia-gpu" >&2 + exit 1 + fi + KVER_BASENAME=$(basename "$KVER_FILE") + KVER_VERSION="${KVER_BASENAME#vmlinuz-}" + cp -p "$KVER_FILE" "/tmp/uvm-out/${KVER_BASENAME}" + for sib in "vmlinux-${KVER_VERSION}" "System.map-${KVER_VERSION}" "config-${KVER_VERSION}"; do + [[ -f "${KBUILD_DESTDIR}/${sib}" ]] && cp -p "${KBUILD_DESTDIR}/${sib}" /tmp/uvm-out/ + done + # Single source of truth for which kernel pairs with this rootfs; + # 05-install-uvm.sh reads this on the install side. + echo "$KVER_BASENAME" > /tmp/uvm-out/kernel.basename + ls -lh /tmp/uvm-out/ + + - name: Verify NVIDIA modules signing key matches the kernel + # Defensive check that the SKID embedded in the kernel's + # `certs/signing_key.x509` appears (raw-bytes hex-encoded) + # somewhere in the trailing PKCS#7 signature of the NVIDIA + # modules tarball's nvidia.ko. Same gate fortress's + # 04-build-uvm-locally.sh applies. On a clean CI runner this + # should always pass; if it ever fails we catch it here, in + # CI, instead of via guest serial capture in production. + run: | + set -euxo pipefail + KBASENAME=$(cat /tmp/uvm-out/kernel.basename) + KVER_VERSION="${KBASENAME#vmlinuz-}" + SIGNING_X509=/tmp/kata/tools/packaging/kata-deploy/local-build/build/kernel-nvidia-gpu/builddir/kata-linux-${KVER_VERSION}/certs/signing_key.x509 + MODULES_TARBALL=/tmp/kata/tools/packaging/kata-deploy/local-build/build/kata-static-kernel-nvidia-gpu-modules.tar.zst + if [[ ! -f "$SIGNING_X509" || ! 
-f "$MODULES_TARBALL" ]]; then + echo "WARN: skipping signing-key check (missing $SIGNING_X509 or $MODULES_TARBALL)" + exit 0 + fi + KEY_SKID=$(openssl x509 -in "$SIGNING_X509" -noout -text \ + | awk '/X509v3 Subject Key Identifier/ && !seen {getline; gsub(/[: ]/,""); print tolower($0); seen=1}') # awk drains all input (no early exit) so openssl cannot take SIGPIPE and abort the step under pipefail + if [[ -z "$KEY_SKID" ]]; then + echo "WARN: could not extract SKID from $SIGNING_X509"; exit 0 + fi + TMP=$(mktemp -d) + tar --zstd -xf "$MODULES_TARBALL" -C "$TMP" --wildcards '*/kernel/drivers/video/nvidia.ko' || true # tar exits non-zero when the member is absent; let the WARN branch below handle that instead of set -e + SAMPLE_KO=$(find "$TMP" -name nvidia.ko -print -quit) + if [[ -z "$SAMPLE_KO" ]]; then + echo "WARN: no nvidia.ko in $MODULES_TARBALL"; exit 0 + fi + if grep -qi "$KEY_SKID" <(xxd -p -c 999999 "$SAMPLE_KO"); then # process substitution: grep -q closing the pipe early cannot flip the verdict via pipefail + echo "OK: nvidia.ko signed by this build's signing key (SKID=$KEY_SKID)" + else + echo "FATAL: nvidia.ko in modules tarball is NOT signed by the kernel's signing_key.x509 (SKID=$KEY_SKID)" + echo " guest will reject NVIDIA modules at first modprobe; pod will sit in Pending" + exit 1 + fi + - name: Surface verity params as JSON metadata id: measure # The root_hash.txt file is the source of truth for kata's @@ -422,6 +521,17 @@ jobs: # a dangling symlink or an empty file. [[ "$IMG_BYTES" -gt 104857600 ]] || { echo "image suspiciously small: $IMG_BYTES bytes"; exit 1; } + # Paired kernel binary surfaced by the "Stage paired kernel" step. + # The install side (fortress/scratch/oci-b200/k8s/05-install-uvm.sh) + # uses .kernel.{filename,sha256} from measurements.json to validate + # and atomically install the kernel alongside the rootfs. + KERNEL_BASENAME="" + KERNEL_SHA="" + if [[ -f /tmp/uvm-out/kernel.basename ]]; then + KERNEL_BASENAME=$(cat /tmp/uvm-out/kernel.basename) + KERNEL_SHA=$(sha256sum "/tmp/uvm-out/${KERNEL_BASENAME}" | awk '{print $1}') + fi + # Resolve the *actual* baked-in driver / nvat pins from versions.yaml # so the artifact reports what is really installed, not just what # was requested.
(When the kata_* inputs are empty we want kata's @@ -443,6 +553,8 @@ jobs: --arg hash_block_sz "$HASH_BLOCK_SIZE" \ --arg img_sha256 "$IMG_SHA256" \ --arg img_bytes "$IMG_BYTES" \ + --arg kernel_name "$KERNEL_BASENAME" \ + --arg kernel_sha "$KERNEL_SHA" \ --arg caa_commit "$GITHUB_SHA" \ --arg build_date "$(date -u +%Y-%m-%dT%H:%M:%SZ)" \ '{ @@ -464,6 +576,8 @@ jobs: sha256: $img_sha256, bytes: ($img_bytes | tonumber) }, + kernel: ( if $kernel_name == "" then null + else {filename: $kernel_name, sha256: $kernel_sha} end ), source: {caa_commit: $caa_commit, build_date: $build_date} }' > /tmp/uvm-out/measurements.json @@ -473,6 +587,8 @@ jobs: echo "img_sha256=$IMG_SHA256" >> "$GITHUB_OUTPUT" echo "driver_ver=$DRIVER_VER" >> "$GITHUB_OUTPUT" echo "nvat_ver=$NVAT_VER" >> "$GITHUB_OUTPUT" + echo "kernel_basename=$KERNEL_BASENAME" >> "$GITHUB_OUTPUT" + echo "kernel_sha=$KERNEL_SHA" >> "$GITHUB_OUTPUT" - name: Compress .image for transport run: | @@ -508,10 +624,30 @@ jobs: NVAT_ANNOTATION=() [[ -n "$NVAT_VER" ]] && NVAT_ANNOTATION+=(--annotation "com.cohere.kata-uvm.nvat-sdk=${NVAT_VER}") + # The paired kernel (and its sibling artifacts) MUST ride along + # with the rootfs in the same OCI artifact so 05-install-uvm.sh + # can install both atomically. See README "Bug F" for the full + # mechanism. We push the kernel uncompressed (~80 MiB raw); zstd + # would only shave ~20 MiB and complicates the install side. 
+ KERNEL_FILES=() + if [[ -f kernel.basename ]]; then + KBASENAME=$(cat kernel.basename) + KVER_VERSION="${KBASENAME#vmlinuz-}" + for kf in "$KBASENAME" "kernel.basename" \ + "vmlinux-${KVER_VERSION}" \ + "System.map-${KVER_VERSION}" \ + "config-${KVER_VERSION}"; do + if [[ -f "$kf" ]]; then + KERNEL_FILES+=( "${kf}:application/vnd.cohere.kata-uvm.kernel+octet-stream" ) + fi + done + fi + oras push "$OCI_REF" \ kata-containers-nvidia-gpu-confidential.img.zst:application/vnd.cohere.kata-uvm.image+zstd \ root_hash.txt:application/vnd.cohere.kata-uvm.verity+plain \ measurements.json:application/vnd.cohere.kata-uvm.measurements+json \ + "${KERNEL_FILES[@]}" \ --annotation "org.opencontainers.image.title=kata-uvm-nvidia-gpu-confidential" \ --annotation "org.opencontainers.image.description=Kata Containers NVIDIA GPU confidential UVM image, built from source with cohere-ai/guest-components" \ --annotation "org.opencontainers.image.source=https://github.com/${GITHUB_REPOSITORY}" \ @@ -525,6 +661,8 @@ jobs: "${NVAT_ANNOTATION[@]}" \ --annotation "com.cohere.kata-uvm.image-sha256=${IMG_SHA256}" \ --annotation "com.cohere.kata-uvm.root-hash=${ROOT_HASH}" \ + --annotation "com.cohere.kata-uvm.kernel-basename=${{ steps.measure.outputs.kernel_basename }}" \ + --annotation "com.cohere.kata-uvm.kernel-sha256=${{ steps.measure.outputs.kernel_sha }}" \ --format json > oras-output.json cat oras-output.json @@ -559,6 +697,8 @@ jobs: echo "| NVIDIA stack | \`${{ needs.meta.outputs.nvidia_gpu_stack }}\` |" echo "| root_hash | \`${{ steps.measure.outputs.root_hash }}\` |" echo "| image sha256 | \`${{ steps.measure.outputs.img_sha256 }}\` |" + echo "| kernel | \`${{ steps.measure.outputs.kernel_basename }}\` |" + echo "| kernel sha256 | \`${{ steps.measure.outputs.kernel_sha }}\` |" echo "" echo "Install on a B200 host with:" echo ""