From 27d34fe4c6fa6ef2287a3f678a5e853749568792 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Wed, 1 Jul 2026 15:03:39 -0700 Subject: [PATCH 1/2] feat(gpu): use aks-gpu-cuda-lts (R580 LTS) for the managed CUDA driver Switch the managed CUDA GPU driver image from aks-gpu-cuda to the R580 LTS variant aks-gpu-cuda-lts (580.159.04-20260629214430). Why: enabling the CUDA driver prebake (#8786 / #8803) needs an aks-gpu image that supports the `build-only` action (aks-gpu #162). The only build-only-capable aks-gpu-cuda images are on the R595 line, which drops NVIDIA Volta/V100 support -- and ~487 managed-GPU nodes across ~293 subscriptions still run on V100 (NC*_v3, ND40rs_v2). aks-gpu already ships aks-gpu-cuda-lts: the NVIDIA R580 Long Term Support branch (supported through Aug 2028), which keeps V100 AND whose post-#162 builds have build-only, and which also covers every other managed CUDA SKU (T4/A100/H100/H200). So this keeps V100 working and unblocks the prebake with no aks-gpu change. It is a move within the 580 line (580.126.09 -> 580.159.04), not a driver-branch jump. Wiring: - components.json: aks-gpu-cuda -> aks-gpu-cuda-lts (repo, renovateTag, version). - gpu_components.go: LoadConfig case -> aks-gpu-cuda-lts (drives the CUDA driver version/suffix used by both VHD build and runtime install). - baker.go GetGPUDriverType: modern CUDA SKUs -> "cuda-lts" (selects the aks-gpu-cuda-lts image via cse_helpers.sh `aks-gpu-${type}`); legacy NCv1 (K80) stays on "cuda" with its pinned R470 driver. - cse_config.sh logGPUDriverPrebakeReadiness: map the driver-type to the aks-gpu marker's driver_kind (cuda-lts -> cuda, grid-v20 -> grid) so a CUDA-prebaked marker matches a cuda-lts node. - install-dependencies.sh: VHD-build prebake/caching image selection -> aks-gpu-cuda-lts. - ACL/Mariner comments updated; their grid-vs-non-grid sysext logic already handles "cuda-lts". 
ACL/AzureLinux install drivers from OS sysext images (not the aks-gpu container), so this change is effectively Ubuntu-scoped. Validation: go test, make generate-testdata (no drift), shellcheck, shellspec (751/0), make validate-components all pass. Supersedes #8810 (which bumped aks-gpu-cuda to R595 and would have dropped V100). Signed-off-by: Ganeshkumar Ashokavardhanan --- parts/common/components.json | 6 +++--- .../cloud-init/artifacts/acl/cse_install_acl.sh | 3 ++- parts/linux/cloud-init/artifacts/cse_config.sh | 11 +++++++++-- .../artifacts/mariner/cse_install_mariner.sh | 3 ++- pkg/agent/baker.go | 11 ++++++++++- pkg/agent/baker_test.go | 13 ++++++++----- pkg/agent/datamodel/gpu_components.go | 2 +- pkg/agent/datamodel/gpu_components_test.go | 2 +- .../linux/cloud-init/artifacts/cse_config_spec.sh | 12 ++++++++++++ vhdbuilder/packer/install-dependencies.sh | 4 ++-- 10 files changed, 50 insertions(+), 17 deletions(-) diff --git a/parts/common/components.json b/parts/common/components.json index c98fbe9d701..e6cd79b0fb5 100644 --- a/parts/common/components.json +++ b/parts/common/components.json @@ -736,10 +736,10 @@ ], "GPUContainerImages": [ { - "downloadURL": "mcr.microsoft.com/aks/aks-gpu-cuda:*", + "downloadURL": "mcr.microsoft.com/aks/aks-gpu-cuda-lts:*", "gpuVersion": { - "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda", - "latestVersion": "580.126.09-20260126030251" + "renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda-lts", + "latestVersion": "580.159.04-20260629214430" } }, { diff --git a/parts/linux/cloud-init/artifacts/acl/cse_install_acl.sh b/parts/linux/cloud-init/artifacts/acl/cse_install_acl.sh index f45694b8df0..07d128156c3 100644 --- a/parts/linux/cloud-init/artifacts/acl/cse_install_acl.sh +++ b/parts/linux/cloud-init/artifacts/acl/cse_install_acl.sh @@ -175,7 +175,8 @@ installGPUDriverSysext() { # # NVIDIA_GPU_DRIVER_TYPE is set by AgentBaker based on the GPU SKU maps in # gpu_components.go. 
Converged sizes get "grid"; RTX PRO 6000 BSE v6 gets - # "grid-v20" (Ubuntu-only, rejected below); all others get "cuda". + # "grid-v20" (Ubuntu-only, rejected below); modern CUDA SKUs get "cuda-lts" and legacy + # NCv1 gets "cuda". Only grid vs non-grid matters here, so both take the CUDA path below. # Legacy GPUs (T4, V100) require proprietary CUDA drivers; A100+ use NVIDIA open drivers. local vm_sku vm_sku=$(get_compute_sku) diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 1f289cda15b..f77dc1a1843 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -1301,13 +1301,20 @@ validateGPUDrivers() { # before enabling consume. Observability only; no behavior change. logGPUDriverPrebakeReadiness() { local marker="${GPU_DKMS_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}" - local marker_present=false driver_kind_match=false m_kind + local marker_present=false driver_kind_match=false m_kind node_kind + # Map the AgentBaker driver-type to the aks-gpu marker's driver_kind (the container's DRIVER_KIND + # build arg): image variants "cuda-lts" and "grid-v20" bake markers as "cuda"/"grid" respectively. + case "${NVIDIA_GPU_DRIVER_TYPE}" in + cuda*) node_kind=cuda ;; + grid*) node_kind=grid ;; + *) node_kind="${NVIDIA_GPU_DRIVER_TYPE}" ;; + esac if [ -f "${marker}" ]; then marker_present=true m_kind="$(sed -n 's/^driver_kind=//p' "${marker}" | head -n1)" # require both sides non-empty so a marker missing driver_kind= (or an unset # NVIDIA_GPU_DRIVER_TYPE) does not falsely report a match (empty = empty). 
- if [ -n "${m_kind}" ] && [ -n "${NVIDIA_GPU_DRIVER_TYPE}" ] && [ "${m_kind}" = "${NVIDIA_GPU_DRIVER_TYPE}" ]; then + if [ -n "${m_kind}" ] && [ -n "${node_kind}" ] && [ "${m_kind}" = "${node_kind}" ]; then driver_kind_match=true fi fi diff --git a/parts/linux/cloud-init/artifacts/mariner/cse_install_mariner.sh b/parts/linux/cloud-init/artifacts/mariner/cse_install_mariner.sh index 7e5c8a0e7f0..5f316f93324 100755 --- a/parts/linux/cloud-init/artifacts/mariner/cse_install_mariner.sh +++ b/parts/linux/cloud-init/artifacts/mariner/cse_install_mariner.sh @@ -115,7 +115,8 @@ downloadGPUDrivers() { # # NVIDIA_GPU_DRIVER_TYPE is set by AgentBaker based on the GPU SKU maps in # gpu_components.go. Converged sizes get "grid"; RTX PRO 6000 BSE v6 gets - # "grid-v20" (Ubuntu-only, rejected below); all others get "cuda". + # "grid-v20" (Ubuntu-only, rejected below); modern CUDA SKUs get "cuda-lts" and legacy + # NCv1 gets "cuda". Only grid vs non-grid matters here, so both take the CUDA path below. # Legacy GPUs (T4, V100) require proprietary CUDA drivers; A100+ use NVIDIA open drivers. KERNEL_VERSION=$(uname -r | sed 's/-/./g') VM_SKU=$(get_compute_sku) diff --git a/pkg/agent/baker.go b/pkg/agent/baker.go index bdc69f59aed..2e8b4d6e101 100644 --- a/pkg/agent/baker.go +++ b/pkg/agent/baker.go @@ -1525,6 +1525,12 @@ func GetAKSGPUImageSHA(size string) string { return datamodel.AKSGPUCudaVersionSuffix } +// GetGPUDriverType maps a GPU VM size to the aks-gpu image variant used to install its driver. +// The value becomes NVIDIA_GPU_DRIVER_TYPE at provision time, which selects the container image +// mcr.microsoft.com/aks/aks-gpu-<type>. Modern CUDA compute SKUs (T4, V100, A100, H100, H200, ...) +// use the R580 LTS image (aks-gpu-cuda-lts): it retains Volta/V100 support that the newer aks-gpu-cuda +// R595 line drops, is supported through Aug 2028, and is the branch the VHD driver prebake is built +// against.
Legacy NCv1 (K80) keeps the separate "cuda" path with its pinned R470 driver. func GetGPUDriverType(size string) string { if useGridV20Drivers(size) { return "grid-v20" @@ -1532,7 +1538,10 @@ func GetGPUDriverType(size string) string { if useGridDrivers(size) { return "grid" } - return "cuda" + if isStandardNCv1(size) { + return "cuda" + } + return "cuda-lts" } func GPUNeedsFabricManager(size string) bool { diff --git a/pkg/agent/baker_test.go b/pkg/agent/baker_test.go index c167bafd10c..5478db71632 100644 --- a/pkg/agent/baker_test.go +++ b/pkg/agent/baker_test.go @@ -964,8 +964,11 @@ var _ = Describe("GetGPUDriverVersion", func() { var _ = Describe("GetGPUDriverType", func() { - It("should use cuda with nc v3", func() { - Expect(GetGPUDriverType("standard_nc6_v3")).To(Equal("cuda")) + It("should use cuda-lts with nc v3", func() { + Expect(GetGPUDriverType("standard_nc6_v3")).To(Equal("cuda-lts")) + }) + It("should keep cuda (legacy R470) with nc v1 (K80)", func() { + Expect(GetGPUDriverType("standard_nc6")).To(Equal("cuda")) }) It("should use grid with nv v5", func() { Expect(GetGPUDriverType("standard_nv6ads_a10_v5")).To(Equal("grid")) @@ -979,8 +982,8 @@ var _ = Describe("GetGPUDriverType", func() { Expect(GetGPUDriverType("Standard_NC320lds_xl_RTXPRO6000BSE_v6")).To(Equal("grid-v20")) }) // NV V1 SKUs were retired in September 2023, leaving this test just for safety - It("should use cuda with nv v1", func() { - Expect(GetGPUDriverType("standard_nv6")).To(Equal("cuda")) + It("should use cuda-lts with nv v1", func() { + Expect(GetGPUDriverType("standard_nv6")).To(Equal("cuda-lts")) }) }) @@ -1118,7 +1121,7 @@ var _ = Describe("getLinuxNodeCSECommand", func() { vars := decodeCSEVars(cseCmd) Expect(vars).To(HaveKeyWithValue("GPU_NODE", "true")) Expect(vars).To(HaveKeyWithValue("CONFIG_GPU_DRIVER_IF_NEEDED", "true")) - Expect(vars).To(HaveKeyWithValue("GPU_DRIVER_TYPE", "cuda")) + Expect(vars).To(HaveKeyWithValue("GPU_DRIVER_TYPE", "cuda-lts")) }) It("should handle 
custom cloud environment", func() { diff --git a/pkg/agent/datamodel/gpu_components.go b/pkg/agent/datamodel/gpu_components.go index 2c432bbede0..e2c8b33090d 100644 --- a/pkg/agent/datamodel/gpu_components.go +++ b/pkg/agent/datamodel/gpu_components.go @@ -62,7 +62,7 @@ func LoadConfig() error { // repos sharing a prefix (e.g. "aks-gpu-grid" vs "aks-gpu-grid-v20") are not // confused by substring matching. switch gpuImageRepo(image.DownloadURL) { - case "aks-gpu-cuda": + case "aks-gpu-cuda-lts": NvidiaCudaDriverVersion = version AKSGPUCudaVersionSuffix = suffix case "aks-gpu-grid": diff --git a/pkg/agent/datamodel/gpu_components_test.go b/pkg/agent/datamodel/gpu_components_test.go index 12d6dfb35c1..86e55937f73 100644 --- a/pkg/agent/datamodel/gpu_components_test.go +++ b/pkg/agent/datamodel/gpu_components_test.go @@ -71,7 +71,7 @@ func TestLoadConfig(t *testing.T) { // LoadConfig switch that maps each repo to its own driver version/suffix. func TestGPUImageRepo(t *testing.T) { cases := map[string]string{ - "mcr.microsoft.com/aks/aks-gpu-cuda:*": "aks-gpu-cuda", + "mcr.microsoft.com/aks/aks-gpu-cuda-lts:*": "aks-gpu-cuda-lts", "mcr.microsoft.com/aks/aks-gpu-grid:*": "aks-gpu-grid", "mcr.microsoft.com/aks/aks-gpu-grid-v20:*": "aks-gpu-grid-v20", "mcr.microsoft.com/aks/aks-gpu-grid-v20:595.58.03-1": "aks-gpu-grid-v20", diff --git a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh index 8bedfbf99d8..c671b911f92 100755 --- a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh @@ -39,6 +39,18 @@ Describe 'cse_config.sh' rm -f "$marker" End + It 'matches a cuda marker for a cuda-lts (R580 LTS) node: driver-type maps to the aks-gpu driver_kind' + marker="$(mktemp)" + printf 'driver_kind=cuda\n' > "$marker" + GPU_DKMS_MARKER_FILE="$marker" + NVIDIA_GPU_DRIVER_TYPE="cuda-lts" + When call logGPUDriverPrebakeReadiness + The output should 
include "marker_present=true" + The output should include "driver_kind_match=true" + The output should include "driver_type=cuda-lts" + rm -f "$marker" + End + It 'reports driver_kind_match=false when a CUDA marker is on a GRID node (not skip-ready)' marker="$(mktemp)" printf 'driver_kind=cuda\n' > "$marker" diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 0139d00a16d..046a239b32b 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -716,7 +716,7 @@ if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then # No ARM64 SKU wit # shellcheck disable=SC2001 imageName=$(echo "$downloadURL" | sed 's/:.*$//') - if [ "$imageName" = "mcr.microsoft.com/aks/aks-gpu-cuda" ]; then + if [ "$imageName" = "mcr.microsoft.com/aks/aks-gpu-cuda-lts" ]; then latestVersion=$(echo "${imageToBePulled}" | jq -r '.gpuVersion.latestVersion') NVIDIA_DRIVER_IMAGE="$imageName" NVIDIA_DRIVER_IMAGE_TAG="$latestVersion" @@ -726,7 +726,7 @@ if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then # No ARM64 SKU wit # Check if the NVIDIA_DRIVER_IMAGE and NVIDIA_DRIVER_IMAGE_TAG were found if [ -z "$NVIDIA_DRIVER_IMAGE" ] || [ -z "$NVIDIA_DRIVER_IMAGE_TAG" ]; then - echo "Error: Unable to find aks-gpu-cuda image in components.json" + echo "Error: Unable to find aks-gpu-cuda-lts image in components.json" exit 1 fi From 7b47e667bc4d62a1b14cd6807c047a9590e91280 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Wed, 1 Jul 2026 15:17:51 -0700 Subject: [PATCH 2/2] fix(gpu): point the renovate rule at aks-gpu-cuda-lts The custom versioning rule that lets Renovate parse the driver image's "<driver-version>-<timestamp>" tag matched "aks/aks-gpu-cuda"; after moving the managed CUDA driver to aks-gpu-cuda-lts, retarget the rule (and groupName) so the LTS repo is version-tracked.
Also flip automerge to false: this is now the V100-critical managed driver, so driver bumps should be reviewed (matching the aks-gpu-grid / grid-v20 rules) rather than auto-merged. Signed-off-by: Ganeshkumar Ashokavardhanan --- .github/renovate.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/renovate.json b/.github/renovate.json index 4e70e191edb..a877abf4655 100644 --- a/.github/renovate.json +++ b/.github/renovate.json @@ -572,11 +572,11 @@ }, { "matchPackageNames": [ - "aks/aks-gpu-cuda" + "aks/aks-gpu-cuda-lts" ], - "groupName": "nvidia-gpu-cuda", + "groupName": "nvidia-gpu-cuda-lts", "versioning": "regex:^(?<major>\\d+)\\.(?<minor>\\d+)\\.(?<patch>\\d+)-(?<prerelease>\\d{14})$", - "automerge": true, + "automerge": false, "enabled": true, "ignoreUnstable": false },