From a0a98820f56910c5a0f84d418bf751d959cdff62 Mon Sep 17 00:00:00 2001
From: Ganeshkumar Ashokavardhanan
Date: Mon, 29 Jun 2026 12:00:50 -0700
Subject: [PATCH 1/3] feat(gpu): pre-bake CUDA driver into the VHD + tear it down where it isn't installed

Splits the bake + teardown half of #8661 into its own PR (the
consume/skip-build half ships separately). Bake and teardown are
intentionally COUPLED: baking the driver into the shared Ubuntu VHD
installs userspace libs + (setuid) binaries + a DKMS-registered module,
so any node that does NOT install the AKS-managed driver must tear it
down -- non-GPU VMs AND GPU VMs that opt out via --gpu-driver None or
the skip toggle/tag (the cleanUpGPUDrivers path). Marker-gated: no-op on
non-prebaked VHDs, never decoupled from the bake.

- install-dependencies.sh: opt-in (FEATURE_FLAGS=NVIDIA_CUDA_PREBAKE,
  default off) build-only bake of the NVIDIA kernel module + libc6-dev
  toolchain.
- cse_install_ubuntu.sh: cleanUpPrebakedGPUDriver removes the installed
  driver (libs, setuid nvidia-* binaries, DKMS reg, ld config, marker)
  via a fast deregister.

Stage-1 observability (greppable AKS_GPU_PREBAKE log lines, for the
staged rollout):
- teardown emits event=teardown status=cleaned|incomplete (fleet-wide
  security-coverage signal; incomplete = a setuid nvidia binary / DKMS
  registration lingered).
- managed GPU nodes emit event=managed_gpu
  marker_present/driver_kind_match, so the rollout can confirm CUDA GPU
  nodes are ready before enabling stage-2 skip-build.

Validation: shellspec 739/0, generate-testdata clean, shellcheck clean.
Reference (full original, kept open): #8661.

Signed-off-by: Ganeshkumar Ashokavardhanan --- .../linux/cloud-init/artifacts/cse_config.sh | 17 +++++++ .../artifacts/ubuntu/cse_install_ubuntu.sh | 51 +++++++++++++++++++ .../cloud-init/artifacts/cse_config_spec.sh | 33 ++++++++++++ .../artifacts/cse_install_ubuntu_spec.sh | 49 ++++++++++++++++++ vhdbuilder/packer/install-dependencies.sh | 27 ++++++++++ 5 files changed, 177 insertions(+) create mode 100644 spec/parts/linux/cloud-init/artifacts/cse_install_ubuntu_spec.sh diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index ff146905b4b..238ab7398f1 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -1295,6 +1295,22 @@ validateGPUDrivers() { fi } +# logGPUDriverPrebakeReadiness emits a stage-1 observability signal on a managed GPU node: whether +# the aks-gpu prebake marker is present and matches this node's driver kind -- i.e. whether stage-2 +# (skip-build) would take the fast path. Lets the rollout confirm managed CUDA GPU nodes are ready +# before enabling consume. Observability only; no behavior change. 
+logGPUDriverPrebakeReadiness() { + local marker="${GPU_DKMS_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}" + local marker_present=false driver_kind_match=false + if [ -f "${marker}" ]; then + marker_present=true + if [ "$(sed -n 's/^driver_kind=//p' "${marker}" | head -n1)" = "${NVIDIA_GPU_DRIVER_TYPE}" ]; then + driver_kind_match=true + fi + fi + echo "AKS_GPU_PREBAKE event=managed_gpu driver_type=${NVIDIA_GPU_DRIVER_TYPE:-} marker_present=${marker_present} driver_kind_match=${driver_kind_match}" +} + ensureGPUDrivers() { if [ "$(isARM64)" -eq 1 ]; then return @@ -1307,6 +1323,7 @@ ensureGPUDrivers() { fi if [ "$OS" = "$UBUNTU_OS_NAME" ]; then logs_to_events "AKS.CSE.ensureGPUDrivers.nvidia-modprobe" "systemctlEnableAndStart nvidia-modprobe 30" || exit $ERR_GPU_DRIVERS_START_FAIL + logGPUDriverPrebakeReadiness fi } diff --git a/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh b/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh index dfe68bfa8e5..162cf9b012b 100755 --- a/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh +++ b/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh @@ -223,12 +223,63 @@ removeNvidiaRepos() { fi } +# cleanUpPrebakedGPUDriver removes a CUDA driver pre-baked into the shared VHD on any node that does +# NOT install the AKS-managed driver -- the cleanUpGPUDrivers path (GPU_NODE != true OR +# skip_nvidia_driver_install=true): non-GPU VMs, and GPU VMs opted out via --gpu-driver None or the +# skip toggle/tag. There the driver is dead weight (wasted disk; nvidia.ko rebuilt on every kernel +# patch) and, on an opted-out GPU node, unused attack surface. The module is never loaded on these +# nodes (ensureGPUDrivers doesn't run), so deregistration is safe. No-op unless the marker exists. +cleanUpPrebakedGPUDriver() { + local marker="${GPU_DKMS_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}" + if [ ! 
-f "${marker}" ]; then + return 0 + fi + echo "Removing pre-baked NVIDIA driver inherited from shared VHD (node does not install the managed driver)" + local dkms_before=false + [ -d /var/lib/dkms/nvidia ] && dkms_before=true + + # Deregister the nvidia DKMS module by removing its source tree (avoids the slow `dkms remove + # --all`, ~35s). The module isn't loaded here, so no depmod/initramfs refresh is needed. + rm -rf /var/lib/dkms/nvidia || true + rm -f /lib/modules/*/updates/dkms/nvidia*.ko* 2>/dev/null || true + # The prebake stages libs under the aks-gpu *container's* GPU_DEST=/usr/bin (aks-gpu config.sh), + # NOT this script's GPU_DEST=/usr/local/nvidia -- so clear /usr/bin. + rm -rf /usr/bin/lib64 || true + # Remove the driver binaries too (same /usr/bin) so the node is genuinely driver-free -- else + # e.g. nvidia-smi stays on PATH but errors once its libs are gone. + for nvidiaBin in nvidia-smi nvidia-debugdump nvidia-persistenced nvidia-cuda-mps-control \ + nvidia-cuda-mps-server nvidia-modprobe nvidia-bug-report.sh nvidia-powerd \ + nvidia-ngx-updater nvidia-sleep.sh; do + rm -f "/usr/bin/${nvidiaBin}" || true + done + rm -f /etc/ld.so.conf.d/nvidia.conf || true + ldconfig || true + rm -f "${marker}" || true + + # Stage-1 observability: confirm the prebaked driver is actually gone. status=incomplete means a + # setuid nvidia binary / DKMS registration lingered (a security-coverage gap to investigate); + # status=cleaned counts toward fleet-wide coverage. Greppable prefix AKS_GPU_PREBAKE. 
+ local marker_after=false dkms_after=false status=cleaned + [ -f "${marker}" ] && marker_after=true + [ -d /var/lib/dkms/nvidia ] && dkms_after=true + if [ "${marker_after}" = true ] || [ "${dkms_after}" = true ]; then + status=incomplete + fi + echo "AKS_GPU_PREBAKE event=teardown gpu_node=${GPU_NODE:-} status=${status} dkms_before=${dkms_before} marker_after=${marker_after} dkms_after=${dkms_after}" +} + cleanUpGPUDrivers() { rm -Rf $GPU_DEST /opt/gpu for packageName in $(managedGPUPackageList); do rm -rf "/opt/${packageName}" done + + # A CUDA driver pre-baked into a shared Ubuntu VHD is dead weight on a node that doesn't install + # the managed driver (non-GPU, or GPU opted out via --gpu-driver None / skip), and while + # DKMS-registered it forces an nvidia.ko rebuild on every kernel patch. Tear it down here. + # No-op on VHDs without the aks-gpu prebake marker. + cleanUpPrebakedGPUDriver } installCriCtlPackage() { diff --git a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh index bc98214c428..8b3cf29ca28 100755 --- a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh @@ -18,6 +18,39 @@ Describe 'cse_config.sh' Include "./parts/linux/cloud-init/artifacts/cse_config.sh" Include "./parts/linux/cloud-init/artifacts/cse_helpers.sh" + Describe 'logGPUDriverPrebakeReadiness' + It 'reports marker_present=false when no prebake marker exists' + GPU_DKMS_MARKER_FILE="/tmp/aks-gpu-readiness-absent-$$" + NVIDIA_GPU_DRIVER_TYPE="cuda" + When call logGPUDriverPrebakeReadiness + The output should include "AKS_GPU_PREBAKE event=managed_gpu" + The output should include "marker_present=false" + The output should include "driver_kind_match=false" + End + + It 'reports marker_present=true driver_kind_match=true when the marker matches the node driver kind' + marker="$(mktemp)" + printf 'driver_kind=cuda\n' > "$marker" + 
GPU_DKMS_MARKER_FILE="$marker" + NVIDIA_GPU_DRIVER_TYPE="cuda" + When call logGPUDriverPrebakeReadiness + The output should include "marker_present=true" + The output should include "driver_kind_match=true" + rm -f "$marker" + End + + It 'reports driver_kind_match=false when a CUDA marker is on a GRID node (not skip-ready)' + marker="$(mktemp)" + printf 'driver_kind=cuda\n' > "$marker" + GPU_DKMS_MARKER_FILE="$marker" + NVIDIA_GPU_DRIVER_TYPE="grid" + When call logGPUDriverPrebakeReadiness + The output should include "marker_present=true" + The output should include "driver_kind_match=false" + rm -f "$marker" + End + End + Describe 'configureAzureJson' AZURE_JSON_PATH="azure.json" AKS_CUSTOM_CLOUD_JSON_PATH="customcloud.json" diff --git a/spec/parts/linux/cloud-init/artifacts/cse_install_ubuntu_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_install_ubuntu_spec.sh new file mode 100644 index 00000000000..a5e73976607 --- /dev/null +++ b/spec/parts/linux/cloud-init/artifacts/cse_install_ubuntu_spec.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +Describe 'cse_install_ubuntu.sh' + Include "./parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh" + + Describe 'cleanUpPrebakedGPUDriver' + It 'is a no-op when the prebake marker is absent' + GPU_DKMS_MARKER_FILE="/tmp/aks-gpu-marker-absent-$$" + When call cleanUpPrebakedGPUDriver + The status should be success + The output should equal "" + End + + It 'deregisters the nvidia DKMS module and removes baked artifacts (libs, binaries, marker) when present' + marker="$(mktemp)" + GPU_DKMS_MARKER_FILE="${marker}" + rm() { echo "mock rm $*"; } + ldconfig() { echo "mock ldconfig"; } + When call cleanUpPrebakedGPUDriver + The status should be success + The output should include "Removing pre-baked NVIDIA driver" + # deregisters via the DKMS source tree + built module removal (no slow dkms remove) + The output should include "mock rm -rf /var/lib/dkms/nvidia" + The output should include "mock rm -f /lib/modules" + # relocated 
userspace libs + The output should include "mock rm -rf /usr/bin/lib64" + # driver userspace binaries so nvidia-smi becomes "command not found" on non-GPU nodes + The output should include "mock rm -f /usr/bin/nvidia-smi" + The output should include "mock ldconfig" + # the slow per-version dkms remove --all must NOT be on the critical path anymore + The output should not include "dkms remove" + # stage-1 observability: a structured outcome line is emitted. Here rm is mocked, so the + # marker is left in place and the cleanup correctly reports an incomplete (security-gap) result. + The output should include "AKS_GPU_PREBAKE event=teardown" + The output should include "status=incomplete" + End + + It 'reports status=cleaned once the marker and DKMS state are actually gone' + marker="$(mktemp)" + GPU_DKMS_MARKER_FILE="${marker}" + ldconfig() { echo "mock ldconfig"; } + When call cleanUpPrebakedGPUDriver + The status should be success + The output should include "AKS_GPU_PREBAKE event=teardown" + The output should include "status=cleaned" + The output should include "marker_after=false" + End + End +End diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 2dd451c48f2..0139d00a16d 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -737,6 +737,33 @@ if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then # No ARM64 SKU wit cat << EOF >> ${VHD_LOGS_FILEPATH} - nvidia-cuda-driver=${NVIDIA_DRIVER_IMAGE_TAG} EOF + + # Opt-in: pre-build the NVIDIA kernel module into the VHD so node provisioning skips the + # ~100s in-CSE DKMS compile. The aks-gpu container is run in "build-only" mode: it compiles + # and DKMS-registers the kernel module + stages userspace libs against THIS VHD's kernel, + # performs NO device access (safe on the GPU-less Packer builder), and writes the marker + # /opt/azure/aks-gpu/dkms-marker. 
At node boot, configGPUDrivers passes "install-skip-build" + # when that marker matches, running only the device-dependent steps. + # The driver image is intentionally LEFT in the VHD: boot-time device init still sources the + # container toolkit debs, fabric manager, containerd runtime config and udev rules from it. + # Dropping the image is a separate, deferred size optimization. + if grep -q "NVIDIA_CUDA_PREBAKE" <<< "$FEATURE_FLAGS"; then + echo "Pre-building NVIDIA CUDA kernel module into the VHD (build-only) for kernel $(uname -r)" + # nvidia-installer needs gcc/make + libc6-dev to compile; the builder lacks them here, so install + # them. A boot-time fallback recompile (marker mismatch) still has them: the VHD ships + # build-essential -> libc6-dev (release-notes manifests) -- the toolchain baseline GPU nodes compile + # with at boot (installDeps runs apt --no-install-recommends, so gcc alone doesn't pull libc6-dev). + apt_get_install 10 2 300 gcc make libc6-dev || exit 1 + CTR_GPU_PREBUILD_CMD="ctr -n k8s.io run --privileged --rm --net-host --with-ns pid:/proc/1/ns/pid --mount type=bind,src=/opt/gpu,dst=/mnt/gpu,options=rbind --mount type=bind,src=/opt/actions,dst=/mnt/actions,options=rbind" + retrycmd_if_failure 3 10 600 bash -c "$CTR_GPU_PREBUILD_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuprebuild /entrypoint.sh build-only" || exit 1 + if [ ! 
-f /opt/azure/aks-gpu/dkms-marker ]; then + echo "Error: NVIDIA CUDA prebake did not produce /opt/azure/aks-gpu/dkms-marker" + exit 1 + fi + cat << EOF >> ${VHD_LOGS_FILEPATH} + - nvidia-cuda-driver-prebaked=${NVIDIA_DRIVER_IMAGE_TAG} (kernel $(uname -r)) +EOF + fi fi if grep -q "NVIDIA_GB" <<< "$FEATURE_FLAGS"; then From 7697798b19cdf9984864baca11db1ccafbcf533b Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Tue, 30 Jun 2026 11:01:15 -0700 Subject: [PATCH 2/3] fix(gpu): harden stage-1 prebake observability (review feedback) - teardown status=incomplete now also flags a lingering setuid /usr/bin/nvidia-modprobe (not just marker/DKMS state), so the security-coverage signal can't report cleaned while the priv-esc surface remains. (Copilot) - logGPUDriverPrebakeReadiness requires the marker driver_kind AND NVIDIA_GPU_DRIVER_TYPE to both be non-empty before reporting driver_kind_match=true (no empty==empty false positive). (Copilot) shellspec 740/0, generate-testdata clean, shellcheck clean. Signed-off-by: Ganeshkumar Ashokavardhanan --- parts/linux/cloud-init/artifacts/cse_config.sh | 7 +++++-- .../artifacts/ubuntu/cse_install_ubuntu.sh | 13 +++++++------ .../linux/cloud-init/artifacts/cse_config_spec.sh | 11 +++++++++++ .../cloud-init/artifacts/cse_install_ubuntu_spec.sh | 2 ++ 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 238ab7398f1..1f289cda15b 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -1301,10 +1301,13 @@ validateGPUDrivers() { # before enabling consume. Observability only; no behavior change. 
logGPUDriverPrebakeReadiness() { local marker="${GPU_DKMS_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}" - local marker_present=false driver_kind_match=false + local marker_present=false driver_kind_match=false m_kind if [ -f "${marker}" ]; then marker_present=true - if [ "$(sed -n 's/^driver_kind=//p' "${marker}" | head -n1)" = "${NVIDIA_GPU_DRIVER_TYPE}" ]; then + m_kind="$(sed -n 's/^driver_kind=//p' "${marker}" | head -n1)" + # require both sides non-empty so a marker missing driver_kind= (or an unset + # NVIDIA_GPU_DRIVER_TYPE) does not falsely report a match (empty = empty). + if [ -n "${m_kind}" ] && [ -n "${NVIDIA_GPU_DRIVER_TYPE}" ] && [ "${m_kind}" = "${NVIDIA_GPU_DRIVER_TYPE}" ]; then driver_kind_match=true fi fi diff --git a/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh b/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh index 162cf9b012b..606e08db056 100755 --- a/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh +++ b/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh @@ -256,16 +256,17 @@ cleanUpPrebakedGPUDriver() { ldconfig || true rm -f "${marker}" || true - # Stage-1 observability: confirm the prebaked driver is actually gone. status=incomplete means a - # setuid nvidia binary / DKMS registration lingered (a security-coverage gap to investigate); - # status=cleaned counts toward fleet-wide coverage. Greppable prefix AKS_GPU_PREBAKE. - local marker_after=false dkms_after=false status=cleaned + # Stage-1 observability: confirm the prebaked driver is actually gone. status=incomplete means the + # marker, the DKMS registration, or the setuid nvidia-modprobe binary lingered (a security-coverage + # gap to investigate); status=cleaned counts toward fleet-wide coverage. Greppable AKS_GPU_PREBAKE. 
+ local marker_after=false dkms_after=false modprobe_after=false status=cleaned [ -f "${marker}" ] && marker_after=true [ -d /var/lib/dkms/nvidia ] && dkms_after=true - if [ "${marker_after}" = true ] || [ "${dkms_after}" = true ]; then + [ -e /usr/bin/nvidia-modprobe ] && modprobe_after=true + if [ "${marker_after}" = true ] || [ "${dkms_after}" = true ] || [ "${modprobe_after}" = true ]; then status=incomplete fi - echo "AKS_GPU_PREBAKE event=teardown gpu_node=${GPU_NODE:-} status=${status} dkms_before=${dkms_before} marker_after=${marker_after} dkms_after=${dkms_after}" + echo "AKS_GPU_PREBAKE event=teardown gpu_node=${GPU_NODE:-} status=${status} dkms_before=${dkms_before} marker_after=${marker_after} dkms_after=${dkms_after} modprobe_after=${modprobe_after}" } cleanUpGPUDrivers() { diff --git a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh index 8b3cf29ca28..1e2c6f6845e 100755 --- a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh @@ -49,6 +49,17 @@ Describe 'cse_config.sh' The output should include "driver_kind_match=false" rm -f "$marker" End + + It 'does not false-positive when the marker lacks driver_kind and the driver type is unset (both empty)' + marker="$(mktemp)" + printf 'kernel=5.15.0-1114-azure\n' > "$marker" # no driver_kind= line + GPU_DKMS_MARKER_FILE="$marker" + NVIDIA_GPU_DRIVER_TYPE="" + When call logGPUDriverPrebakeReadiness + The output should include "marker_present=true" + The output should include "driver_kind_match=false" + rm -f "$marker" + End End Describe 'configureAzureJson' diff --git a/spec/parts/linux/cloud-init/artifacts/cse_install_ubuntu_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_install_ubuntu_spec.sh index a5e73976607..0581313a786 100644 --- a/spec/parts/linux/cloud-init/artifacts/cse_install_ubuntu_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_install_ubuntu_spec.sh 
@@ -44,6 +44,8 @@ Describe 'cse_install_ubuntu.sh' The output should include "AKS_GPU_PREBAKE event=teardown" The output should include "status=cleaned" The output should include "marker_after=false" + # the setuid nvidia-modprobe is part of the security-coverage check + The output should include "modprobe_after=false" End End End From 2793ff459a9fb5b658f03469c633057f0fa53982 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Tue, 30 Jun 2026 11:54:23 -0700 Subject: [PATCH 3/3] fix(gpu): keep prebake marker on incomplete teardown + de-flake marker-absent tests - cleanUpPrebakedGPUDriver drops the marker only after a clean teardown; if the DKMS registration or the setuid nvidia-modprobe lingered it KEEPS the marker so the next provision re-runs the cleanup (the marker is the "still needs cleanup" flag). (Copilot) - de-flake the two "marker absent" specs: use a created-then-removed temp path instead of a predictable /tmp/...$$ path that could already exist. (Copilot) shellspec 740/0, generate-testdata clean, shellcheck clean. Signed-off-by: Ganeshkumar Ashokavardhanan --- .../artifacts/ubuntu/cse_install_ubuntu.sh | 16 ++++++++++------ .../cloud-init/artifacts/cse_config_spec.sh | 2 +- .../artifacts/cse_install_ubuntu_spec.sh | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh b/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh index 606e08db056..602ea0f89e2 100755 --- a/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh +++ b/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh @@ -254,15 +254,19 @@ cleanUpPrebakedGPUDriver() { done rm -f /etc/ld.so.conf.d/nvidia.conf || true ldconfig || true - rm -f "${marker}" || true - # Stage-1 observability: confirm the prebaked driver is actually gone. 
status=incomplete means the - # marker, the DKMS registration, or the setuid nvidia-modprobe binary lingered (a security-coverage - # gap to investigate); status=cleaned counts toward fleet-wide coverage. Greppable AKS_GPU_PREBAKE. - local marker_after=false dkms_after=false modprobe_after=false status=cleaned - [ -f "${marker}" ] && marker_after=true + # Stage-1 observability + retry: assess completeness BEFORE dropping the marker. status=incomplete + # means the DKMS registration or the setuid nvidia-modprobe binary lingered (a security-coverage + # alert). On an incomplete teardown we KEEP the marker so the next provision re-runs this cleanup + # (the marker is the "still needs cleanup" flag); on a clean teardown we drop it. status=cleaned + # counts toward fleet-wide coverage. Greppable prefix AKS_GPU_PREBAKE. + local dkms_after=false modprobe_after=false marker_after=true status=cleaned [ -d /var/lib/dkms/nvidia ] && dkms_after=true [ -e /usr/bin/nvidia-modprobe ] && modprobe_after=true + if [ "${dkms_after}" = false ] && [ "${modprobe_after}" = false ]; then + rm -f "${marker}" || true + [ -f "${marker}" ] || marker_after=false + fi if [ "${marker_after}" = true ] || [ "${dkms_after}" = true ] || [ "${modprobe_after}" = true ]; then status=incomplete fi diff --git a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh index 1e2c6f6845e..8bedfbf99d8 100755 --- a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh @@ -20,7 +20,7 @@ Describe 'cse_config.sh' Describe 'logGPUDriverPrebakeReadiness' It 'reports marker_present=false when no prebake marker exists' - GPU_DKMS_MARKER_FILE="/tmp/aks-gpu-readiness-absent-$$" + GPU_DKMS_MARKER_FILE="$(mktemp)"; rm -f "${GPU_DKMS_MARKER_FILE}" NVIDIA_GPU_DRIVER_TYPE="cuda" When call logGPUDriverPrebakeReadiness The output should include "AKS_GPU_PREBAKE event=managed_gpu" diff --git 
a/spec/parts/linux/cloud-init/artifacts/cse_install_ubuntu_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_install_ubuntu_spec.sh index 0581313a786..9a1569f803d 100644 --- a/spec/parts/linux/cloud-init/artifacts/cse_install_ubuntu_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_install_ubuntu_spec.sh @@ -5,7 +5,7 @@ Describe 'cse_install_ubuntu.sh' Describe 'cleanUpPrebakedGPUDriver' It 'is a no-op when the prebake marker is absent' - GPU_DKMS_MARKER_FILE="/tmp/aks-gpu-marker-absent-$$" + GPU_DKMS_MARKER_FILE="$(mktemp)"; rm -f "${GPU_DKMS_MARKER_FILE}" When call cleanUpPrebakedGPUDriver The status should be success The output should equal ""