Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1295,6 +1295,25 @@ validateGPUDrivers() {
fi
}

# logGPUDriverPrebakeReadiness emits a stage-1 observability signal on a managed GPU node: whether
# the aks-gpu prebake marker is present and whether its driver_kind matches this node's driver
# type -- i.e. whether stage-2 (skip-build) would take the fast path. Lets the rollout confirm
# managed CUDA GPU nodes are ready before enabling consume. Observability only; no behavior change.
# Globals:
#   GPU_DKMS_MARKER_FILE   (read) marker path; defaults to /opt/azure/aks-gpu/dkms-marker
#   NVIDIA_GPU_DRIVER_TYPE (read) this node's driver kind; may be unset or empty
# Outputs:
#   One greppable "AKS_GPU_PREBAKE event=managed_gpu ..." line on stdout.
logGPUDriverPrebakeReadiness() {
  local marker="${GPU_DKMS_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}"
  # Default the driver type once so the comparison and the log line treat an unset variable
  # identically (and neither trips `set -u`).
  local driver_type="${NVIDIA_GPU_DRIVER_TYPE:-}"
  local marker_present=false driver_kind_match=false m_kind=""

  if [ -f "${marker}" ]; then
    marker_present=true
    # First "driver_kind=<value>" line wins; yields an empty string when the key is absent.
    m_kind="$(sed -n 's/^driver_kind=//p' "${marker}" | head -n1)"
    # require both sides non-empty so a marker missing driver_kind= (or an unset
    # NVIDIA_GPU_DRIVER_TYPE) does not falsely report a match (empty = empty).
    if [ -n "${m_kind}" ] && [ -n "${driver_type}" ] && [ "${m_kind}" = "${driver_type}" ]; then
      driver_kind_match=true
    fi
  fi

  echo "AKS_GPU_PREBAKE event=managed_gpu driver_type=${driver_type} marker_present=${marker_present} driver_kind_match=${driver_kind_match}"
}

ensureGPUDrivers() {
if [ "$(isARM64)" -eq 1 ]; then
return
Expand All @@ -1307,6 +1326,7 @@ ensureGPUDrivers() {
fi
if [ "$OS" = "$UBUNTU_OS_NAME" ]; then
logs_to_events "AKS.CSE.ensureGPUDrivers.nvidia-modprobe" "systemctlEnableAndStart nvidia-modprobe 30" || exit $ERR_GPU_DRIVERS_START_FAIL
logGPUDriverPrebakeReadiness
fi
}

Expand Down
56 changes: 56 additions & 0 deletions parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -223,12 +223,68 @@ removeNvidiaRepos() {
fi
}

# cleanUpPrebakedGPUDriver removes a CUDA driver pre-baked into the shared VHD on any node that does
# NOT install the AKS-managed driver -- the cleanUpGPUDrivers path (GPU_NODE != true OR
# skip_nvidia_driver_install=true): non-GPU VMs, and GPU VMs opted out via --gpu-driver None or the
# skip toggle/tag. There the driver is dead weight (wasted disk; nvidia.ko rebuilt on every kernel
# patch) and, on an opted-out GPU node, unused attack surface. The module is never loaded on these
# nodes (ensureGPUDrivers doesn't run), so deregistration is safe. No-op unless the marker exists.
# Globals:
#   GPU_DKMS_MARKER_FILE (read) marker path; defaults to /opt/azure/aks-gpu/dkms-marker
#   GPU_NODE             (read) only echoed into the outcome line
# Outputs:
#   A human-readable banner plus one "AKS_GPU_PREBAKE event=teardown ..." line on stdout.
# Returns:
#   0 always; every removal is best-effort and the outcome is reported via status=.
cleanUpPrebakedGPUDriver() {
  local marker="${GPU_DKMS_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}"
  if [ ! -f "${marker}" ]; then
    return 0
  fi
  echo "Removing pre-baked NVIDIA driver inherited from shared VHD (node does not install the managed driver)"

  # Remember whether a DKMS registration existed before the teardown (reported below).
  local dkms_before=false
  if [ -d /var/lib/dkms/nvidia ]; then
    dkms_before=true
  fi

  # Deregister the nvidia DKMS module by removing its source tree (avoids the slow `dkms remove
  # --all`, ~35s). The module isn't loaded here, so no depmod/initramfs refresh is needed.
  rm -rf /var/lib/dkms/nvidia || true
  # An unmatched glob stays literal; -f plus the stderr redirect keep that case silent.
  rm -f /lib/modules/*/updates/dkms/nvidia*.ko* 2>/dev/null || true
  # The prebake stages libs under the aks-gpu *container's* GPU_DEST=/usr/bin (aks-gpu config.sh),
  # NOT this script's GPU_DEST=/usr/local/nvidia -- so clear /usr/bin.
  rm -rf /usr/bin/lib64 || true
  # Remove the driver binaries too (same /usr/bin) so the node is genuinely driver-free -- else
  # e.g. nvidia-smi stays on PATH but errors once its libs are gone.
  local nvidiaBin # keep the loop variable out of the caller's scope
  for nvidiaBin in nvidia-smi nvidia-debugdump nvidia-persistenced nvidia-cuda-mps-control \
    nvidia-cuda-mps-server nvidia-modprobe nvidia-bug-report.sh nvidia-powerd \
    nvidia-ngx-updater nvidia-sleep.sh; do
    rm -f "/usr/bin/${nvidiaBin}" || true
  done
  rm -f /etc/ld.so.conf.d/nvidia.conf || true
  ldconfig || true

  # Stage-1 observability + retry: assess completeness BEFORE dropping the marker. status=incomplete
  # means the DKMS registration or the setuid nvidia-modprobe binary lingered (a security-coverage
  # alert). On an incomplete teardown we KEEP the marker so the next provision re-runs this cleanup
  # (the marker is the "still needs cleanup" flag); on a clean teardown we drop it. status=cleaned
  # counts toward fleet-wide coverage. Greppable prefix AKS_GPU_PREBAKE.
  local dkms_after=false modprobe_after=false marker_after=true status=cleaned
  if [ -d /var/lib/dkms/nvidia ]; then
    dkms_after=true
  fi
  if [ -e /usr/bin/nvidia-modprobe ]; then
    modprobe_after=true
  fi
  if [ "${dkms_after}" = false ] && [ "${modprobe_after}" = false ]; then
    rm -f "${marker}" || true
    # Re-check on disk rather than trusting rm's exit status.
    if [ ! -f "${marker}" ]; then
      marker_after=false
    fi
  fi
  if [ "${marker_after}" = true ] || [ "${dkms_after}" = true ] || [ "${modprobe_after}" = true ]; then
    status=incomplete
  fi
  echo "AKS_GPU_PREBAKE event=teardown gpu_node=${GPU_NODE:-} status=${status} dkms_before=${dkms_before} marker_after=${marker_after} dkms_after=${dkms_after} modprobe_after=${modprobe_after}"
}

# cleanUpGPUDrivers removes GPU driver artifacts on a node that does not install the managed
# driver: the GPU_DEST tree, /opt/gpu, each /opt/<package> directory named by
# managedGPUPackageList, and finally any driver pre-baked into the VHD.
# Globals:
#   GPU_DEST (read) driver install root; skipped when unset or empty
# Returns:
#   The status of cleanUpPrebakedGPUDriver (the last command).
cleanUpGPUDrivers() {
  local packageName # keep the loop variable out of the caller's scope

  # ${GPU_DEST:+...} expands to nothing when GPU_DEST is unset/empty (same as the unquoted form)
  # but passes a non-empty value as exactly one argument, without word splitting or globbing.
  rm -Rf ${GPU_DEST:+"${GPU_DEST}"} /opt/gpu

  # managedGPUPackageList prints a whitespace-separated list; splitting it is intentional.
  for packageName in $(managedGPUPackageList); do
    rm -rf "/opt/${packageName}"
  done

  # A CUDA driver pre-baked into a shared Ubuntu VHD is dead weight on a node that doesn't install
  # the managed driver (non-GPU, or GPU opted out via --gpu-driver None / skip), and while
  # DKMS-registered it forces an nvidia.ko rebuild on every kernel patch. Tear it down here.
  # No-op on VHDs without the aks-gpu prebake marker.
  cleanUpPrebakedGPUDriver
}

installCriCtlPackage() {
Expand Down
44 changes: 44 additions & 0 deletions spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,50 @@ Describe 'cse_config.sh'
Include "./parts/linux/cloud-init/artifacts/cse_config.sh"
Include "./parts/linux/cloud-init/artifacts/cse_helpers.sh"

# ShellSpec examples for logGPUDriverPrebakeReadiness. Each example points GPU_DKMS_MARKER_FILE at
# a temp path, sets NVIDIA_GPU_DRIVER_TYPE, calls the function and checks the key=value fields of
# the emitted AKS_GPU_PREBAKE line.
Describe 'logGPUDriverPrebakeReadiness'
# No marker on disk: mktemp reserves a unique path which is deleted again before the call.
It 'reports marker_present=false when no prebake marker exists'
GPU_DKMS_MARKER_FILE="$(mktemp)"; rm -f "${GPU_DKMS_MARKER_FILE}"
NVIDIA_GPU_DRIVER_TYPE="cuda"
When call logGPUDriverPrebakeReadiness
The output should include "AKS_GPU_PREBAKE event=managed_gpu"
The output should include "marker_present=false"
The output should include "driver_kind_match=false"
End

# Marker driver_kind equals the node driver type: the skip-build-ready case.
It 'reports marker_present=true driver_kind_match=true when the marker matches the node driver kind'
marker="$(mktemp)"
printf 'driver_kind=cuda\n' > "$marker"
GPU_DKMS_MARKER_FILE="$marker"
NVIDIA_GPU_DRIVER_TYPE="cuda"
When call logGPUDriverPrebakeReadiness
The output should include "marker_present=true"
The output should include "driver_kind_match=true"
rm -f "$marker"
End

# Marker present but for a different driver kind: must not be reported as a match.
It 'reports driver_kind_match=false when a CUDA marker is on a GRID node (not skip-ready)'
marker="$(mktemp)"
printf 'driver_kind=cuda\n' > "$marker"
GPU_DKMS_MARKER_FILE="$marker"
NVIDIA_GPU_DRIVER_TYPE="grid"
When call logGPUDriverPrebakeReadiness
The output should include "marker_present=true"
The output should include "driver_kind_match=false"
rm -f "$marker"
End

# Regression guard: an empty marker kind compared with an empty driver type is not a match.
It 'does not false-positive when the marker lacks driver_kind and the driver type is unset (both empty)'
marker="$(mktemp)"
printf 'kernel=5.15.0-1114-azure\n' > "$marker" # no driver_kind= line
GPU_DKMS_MARKER_FILE="$marker"
NVIDIA_GPU_DRIVER_TYPE=""
When call logGPUDriverPrebakeReadiness
The output should include "marker_present=true"
The output should include "driver_kind_match=false"
rm -f "$marker"
End
End

Describe 'configureAzureJson'
AZURE_JSON_PATH="azure.json"
AKS_CUSTOM_CLOUD_JSON_PATH="customcloud.json"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash

# ShellSpec examples for cse_install_ubuntu.sh. cleanUpPrebakedGPUDriver deletes fixed system paths
# (/var/lib/dkms/nvidia, /usr/bin/lib64, /usr/bin/nvidia-*, /etc/ld.so.conf.d/nvidia.conf), so
# every example that gets past the marker check mocks rm and ldconfig: the spec must never modify
# the machine it runs on.
Describe 'cse_install_ubuntu.sh'
  Include "./parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh"

  Describe 'cleanUpPrebakedGPUDriver'
    It 'is a no-op when the prebake marker is absent'
      # mktemp reserves a unique path; deleting it guarantees the marker does not exist.
      GPU_DKMS_MARKER_FILE="$(mktemp)"; rm -f "${GPU_DKMS_MARKER_FILE}"
      When call cleanUpPrebakedGPUDriver
      The status should be success
      The output should equal ""
    End

    It 'deregisters the nvidia DKMS module and removes baked artifacts (libs, binaries, marker) when present'
      marker="$(mktemp)"
      GPU_DKMS_MARKER_FILE="${marker}"
      rm() { echo "mock rm $*"; }
      ldconfig() { echo "mock ldconfig"; }
      When call cleanUpPrebakedGPUDriver
      The status should be success
      The output should include "Removing pre-baked NVIDIA driver"
      # deregisters via the DKMS source tree + built module removal (no slow dkms remove)
      The output should include "mock rm -rf /var/lib/dkms/nvidia"
      The output should include "mock rm -f /lib/modules"
      # relocated userspace libs
      The output should include "mock rm -rf /usr/bin/lib64"
      # driver userspace binaries so nvidia-smi becomes "command not found" on non-GPU nodes
      The output should include "mock rm -f /usr/bin/nvidia-smi"
      The output should include "mock ldconfig"
      # the slow per-version dkms remove --all must NOT be on the critical path anymore
      The output should not include "dkms remove"
      # stage-1 observability: a structured outcome line is emitted. Here rm is mocked, so the
      # marker is left in place and the cleanup correctly reports an incomplete (security-gap) result.
      The output should include "AKS_GPU_PREBAKE event=teardown"
      The output should include "status=incomplete"
      # rm is mocked above, so bypass the mock to avoid leaking the temp marker.
      command rm -f "${marker}"
    End

    It 'reports status=cleaned once the marker and DKMS state are actually gone'
      marker="$(mktemp)"
      GPU_DKMS_MARKER_FILE="${marker}"
      # Really delete ONLY the temp marker (called as `rm -f <marker>`); swallow every other
      # removal so the example never deletes driver files from the host running the spec.
      rm() {
        if [ "$#" -eq 2 ] && [ "$2" = "${GPU_DKMS_MARKER_FILE}" ]; then
          command rm "$@"
        fi
      }
      ldconfig() { echo "mock ldconfig"; }
      When call cleanUpPrebakedGPUDriver
      The status should be success
      The output should include "AKS_GPU_PREBAKE event=teardown"
      # assumes the spec host has no /var/lib/dkms/nvidia and no /usr/bin/nvidia-modprobe
      The output should include "status=cleaned"
      The output should include "marker_after=false"
      # the setuid nvidia-modprobe is part of the security-coverage check
      The output should include "modprobe_after=false"
    End
  End
End
27 changes: 27 additions & 0 deletions vhdbuilder/packer/install-dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -737,6 +737,33 @@ if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then # No ARM64 SKU wit
cat << EOF >> ${VHD_LOGS_FILEPATH}
- nvidia-cuda-driver=${NVIDIA_DRIVER_IMAGE_TAG}
EOF

# Opt-in: pre-build the NVIDIA kernel module into the VHD so node provisioning skips the
# ~100s in-CSE DKMS compile. The aks-gpu container is run in "build-only" mode: it compiles
# and DKMS-registers the kernel module + stages userspace libs against THIS VHD's kernel,
# performs NO device access (safe on the GPU-less Packer builder), and writes the marker
# /opt/azure/aks-gpu/dkms-marker. At node boot, configGPUDrivers passes "install-skip-build"
# when that marker matches, running only the device-dependent steps.
# The driver image is intentionally LEFT in the VHD: boot-time device init still sources the
# container toolkit debs, fabric manager, containerd runtime config and udev rules from it.
# Dropping the image is a separate, deferred size optimization.
# Enabled only when FEATURE_FLAGS contains NVIDIA_CUDA_PREBAKE; any failure aborts the VHD build.
if grep -q "NVIDIA_CUDA_PREBAKE" <<< "$FEATURE_FLAGS"; then
  echo "Pre-building NVIDIA CUDA kernel module into the VHD (build-only) for kernel $(uname -r)"
  # nvidia-installer needs gcc/make + libc6-dev to compile; the builder lacks them here, so install
  # them. A boot-time fallback recompile (marker mismatch) still has them: the VHD ships
  # build-essential -> libc6-dev (release-notes manifests) -- the toolchain baseline GPU nodes compile
  # with at boot (installDeps runs apt --no-install-recommends, so gcc alone doesn't pull libc6-dev).
  apt_get_install 10 2 300 gcc make libc6-dev || exit 1
  CTR_GPU_PREBUILD_CMD="ctr -n k8s.io run --privileged --rm --net-host --with-ns pid:/proc/1/ns/pid --mount type=bind,src=/opt/gpu,dst=/mnt/gpu,options=rbind --mount type=bind,src=/opt/actions,dst=/mnt/actions,options=rbind"
  retrycmd_if_failure 3 10 600 bash -c "$CTR_GPU_PREBUILD_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuprebuild /entrypoint.sh build-only" || exit 1
  # A zero exit from the container is not enough: the marker is what node boot keys off, so fail
  # the build when it was not written.
  if [ ! -f /opt/azure/aks-gpu/dkms-marker ]; then
    echo "Error: NVIDIA CUDA prebake did not produce /opt/azure/aks-gpu/dkms-marker"
    exit 1
  fi
  # Record the prebake in the VHD log. The heredoc body is written verbatim, so it is not indented
  # with the surrounding code.
  cat << EOF >> ${VHD_LOGS_FILEPATH}
- nvidia-cuda-driver-prebaked=${NVIDIA_DRIVER_IMAGE_TAG} (kernel $(uname -r))
EOF
fi
fi

if grep -q "NVIDIA_GB" <<< "$FEATURE_FLAGS"; then
Expand Down
Loading