Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions parts/linux/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1213,20 +1213,35 @@ pullGPUDriverImage() {
}

# Runs the GPU driver image's /entrypoint.sh with the requested action.
# Globals:   CTR_GPU_INSTALL_CMD, NVIDIA_DRIVER_IMAGE, NVIDIA_DRIVER_IMAGE_TAG (read)
# Arguments: $1 - entrypoint action; defaults to "install" when omitted or empty
# Returns:   the exit status of retrycmd_if_failure
installGPUDriverImage() {
    local gpuInstallAction="${1:-install}"
    # Invoke the container exactly once, with the selected action. A second,
    # unconditional "install" call here would defeat a caller that asked for a
    # different action by always running the full install first.
    retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh ${gpuInstallAction}"
}

configGPUDrivers() {
if [ "$OS" = "$UBUNTU_OS_NAME" ]; then
waitForContainerdReady || exit $ERR_GPU_DRIVERS_START_FAIL
mkdir -p /opt/{actions,gpu}
# When the kernel module was pre-built into the VHD (build-only at image-bake time),
# a marker is present. Ask aks-gpu to skip the ~100s DKMS recompile and run only the
# device-dependent steps -- but ONLY when the marker's driver_kind matches THIS node's
# driver (NVIDIA_GPU_DRIVER_TYPE). A CUDA-prebaked marker on a GRID node (or vice-versa)
# must request a full "install": the other driver image may not even support
# install-skip-build and would fail to stage its userspace files (e.g. /opt/gpu/config.sh).
# aks-gpu still independently re-validates the marker (kernel + driver_version +
# driver_kind) and falls back to a full build on any remaining mismatch (e.g. kernel drift).
local GPU_INSTALL_ACTION="install"
local GPU_DKMS_MARKER="${GPU_DKMS_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}"
if [ -f "$GPU_DKMS_MARKER" ] && \
[ "$(sed -n 's/^driver_kind=//p' "$GPU_DKMS_MARKER" | head -n1)" = "$NVIDIA_GPU_DRIVER_TYPE" ]; then
GPU_INSTALL_ACTION="install-skip-build"
fi
# The driver image is normally pre-pulled into the VHD; only hit the registry when it is
# actually missing so provisioning doesn't pay a redundant manifest/layer round trip.
# Use containerd's native exact-name filter rather than text-matching `images ls` output.
if [ -z "$(ctr -n k8s.io images ls -q "name==${NVIDIA_DRIVER_IMAGE}:${NVIDIA_DRIVER_IMAGE_TAG}")" ]; then
logs_to_events "AKS.CSE.configGPUDrivers.pullGPUDriverImage" pullGPUDriverImage
fi
logs_to_events "AKS.CSE.configGPUDrivers.installGPUDriverImage" installGPUDriverImage
logs_to_events "AKS.CSE.configGPUDrivers.installGPUDriverImage" installGPUDriverImage "$GPU_INSTALL_ACTION"
ret=$?
if [ "$ret" -ne 0 ]; then
echo "Failed to install GPU driver, exiting..."
Expand Down
51 changes: 51 additions & 0 deletions spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1888,6 +1888,57 @@ SETUP_EOF
End
End

Describe 'configGPUDrivers prebake marker action selection'
# Mock everything the Ubuntu path touches so the test exercises only the
# marker -> aks-gpu action selection (install vs install-skip-build), including the
# driver_kind guard (a CUDA-baked marker on a GRID node must NOT skip the build).
# logs_to_events is mocked to faithfully dispatch the wrapped command (dropping the
# event-name arg) so the real installGPUDriverImage runs and surfaces the action.
# Mock: drop the event-name argument and run the wrapped command. "$@" is quoted so
# each remaining argument reaches the command as one word, without re-splitting or globbing.
logs_to_events() { shift; "$@"; }
# Readiness gate always succeeds, so configGPUDrivers does not exit at its first check.
waitForContainerdReady() { return 0; }
# No-op: keeps the spec from creating /opt/{actions,gpu} on the machine running the tests.
mkdir() { :; }
# Prints a non-empty line for every invocation. Because configGPUDrivers pulls the image
# only when `ctr ... images ls -q` prints nothing, this stub makes the image look present
# and pullGPUDriverImage is never called.
ctr() { echo "ctr $*"; }
# The stubs below succeed unconditionally; presumably they are invoked by the part of
# configGPUDrivers after the install step (not exercised by these assertions) -- confirm.
nvidia-modprobe() { return 0; }
nvidia-smi() { return 0; }
ldconfig() { return 0; }
# Distro probes return non-zero ("false"), consistent with the Ubuntu OS set in BeforeEach.
isMarinerOrAzureLinux() { return 1; }
isAzureLinuxOSGuard() { return 1; }
isACL() { return 1; }
# Service management is stubbed out so no real units are touched.
systemctlEnableAndStart() { return 0; }
systemctl() { return 0; }
# Capture the action passed to the install container.
retrycmd_if_failure() {
    # Discard the three leading arguments (5 10 600 at the call site), then print the
    # remaining command line so the spec can inspect which entrypoint action was requested.
    shift 3
    printf '%s\n' "INSTALL_CMD: $*"
    return 0
}

BeforeEach 'OS="$UBUNTU_OS_NAME"; NVIDIA_GPU_DRIVER_TYPE="cuda"; NVIDIA_DRIVER_IMAGE="mcr.microsoft.com/aks/aks-gpu-cuda"; NVIDIA_DRIVER_IMAGE_TAG="580.0.0"; CTR_GPU_INSTALL_CMD="ctr-run"; GPU_DKMS_MARKER_FILE="$(mktemp)"; rm -f "$GPU_DKMS_MARKER_FILE"'

It 'uses the full install action when no prebake marker is present'
When call configGPUDrivers
The output should include "/entrypoint.sh install"
The output should not include "install-skip-build"
End

It 'uses install-skip-build when the prebake marker matches the node driver kind'
marker="$(mktemp)"
printf 'driver_kind=cuda\n' > "$marker"
GPU_DKMS_MARKER_FILE="$marker"
When call configGPUDrivers
The output should include "/entrypoint.sh install-skip-build"
rm -f "$marker"
End

It 'falls back to full install when the marker driver_kind does not match the node (CUDA marker on GRID node)'
marker="$(mktemp)"
printf 'driver_kind=cuda\n' > "$marker"
GPU_DKMS_MARKER_FILE="$marker"
NVIDIA_GPU_DRIVER_TYPE="grid"
NVIDIA_DRIVER_IMAGE="mcr.microsoft.com/aks/aks-gpu-grid"
When call configGPUDrivers
The output should include "/entrypoint.sh install"
The output should not include "install-skip-build"
rm -f "$marker"
End
End

Describe 'configureManagedGPUExperience'
# Mock the helper functions
logs_to_events() {
Expand Down
Loading