diff --git a/parts/linux/cloud-init/artifacts/acl/cse_install_acl.sh b/parts/linux/cloud-init/artifacts/acl/cse_install_acl.sh index 07d128156c3..82d27a32e9b 100644 --- a/parts/linux/cloud-init/artifacts/acl/cse_install_acl.sh +++ b/parts/linux/cloud-init/artifacts/acl/cse_install_acl.sh @@ -175,8 +175,9 @@ installGPUDriverSysext() { # # NVIDIA_GPU_DRIVER_TYPE is set by AgentBaker based on the GPU SKU maps in # gpu_components.go. Converged sizes get "grid"; RTX PRO 6000 BSE v6 gets - # "grid-v20" (Ubuntu-only, rejected below); modern CUDA SKUs get "cuda-lts" and legacy - # NCv1 gets "cuda". Only grid vs non-grid matters here, so both take the CUDA path below. + # "grid-v20" (Ubuntu-only, rejected below); all other CUDA SKUs (including legacy + # NCv1/K80) get "cuda-lts". Only grid vs non-grid matters here, so the CUDA path + # below handles every non-grid SKU. # Legacy GPUs (T4, V100) require proprietary CUDA drivers; A100+ use NVIDIA open drivers. local vm_sku vm_sku=$(get_compute_sku) diff --git a/parts/linux/cloud-init/artifacts/mariner/cse_install_mariner.sh b/parts/linux/cloud-init/artifacts/mariner/cse_install_mariner.sh index 5f316f93324..d7caecddae7 100755 --- a/parts/linux/cloud-init/artifacts/mariner/cse_install_mariner.sh +++ b/parts/linux/cloud-init/artifacts/mariner/cse_install_mariner.sh @@ -115,8 +115,9 @@ downloadGPUDrivers() { # # NVIDIA_GPU_DRIVER_TYPE is set by AgentBaker based on the GPU SKU maps in # gpu_components.go. Converged sizes get "grid"; RTX PRO 6000 BSE v6 gets - # "grid-v20" (Ubuntu-only, rejected below); modern CUDA SKUs get "cuda-lts" and legacy - # NCv1 gets "cuda". Only grid vs non-grid matters here, so both take the CUDA path below. + # "grid-v20" (Ubuntu-only, rejected below); all other CUDA SKUs (including legacy + # NCv1/K80) get "cuda-lts". Only grid vs non-grid matters here, so the CUDA path + # below handles every non-grid SKU. # Legacy GPUs (T4, V100) require proprietary CUDA drivers; A100+ use NVIDIA open drivers. 
KERNEL_VERSION=$(uname -r | sed 's/-/./g') VM_SKU=$(get_compute_sku) diff --git a/pkg/agent/baker.go b/pkg/agent/baker.go index 2e8b4d6e101..4177e9d3ce6 100644 --- a/pkg/agent/baker.go +++ b/pkg/agent/baker.go @@ -1494,17 +1494,9 @@ func GetGPUDriverVersion(size string) string { if useGridDrivers(size) { return datamodel.NvidiaGridDriverVersion } - if isStandardNCv1(size) { - return datamodel.Nvidia470CudaDriverVersion - } return datamodel.NvidiaCudaDriverVersion } -func isStandardNCv1(size string) bool { - tmp := strings.ToLower(size) - return strings.HasPrefix(tmp, "standard_nc") && !strings.Contains(tmp, "_v") -} - func useGridDrivers(size string) bool { return datamodel.ConvergedGPUDriverSizes[strings.ToLower(size)] } @@ -1527,10 +1519,15 @@ func GetAKSGPUImageSHA(size string) string { // GetGPUDriverType maps a GPU VM size to the aks-gpu image variant used to install its driver. // The value becomes NVIDIA_GPU_DRIVER_TYPE at provision time, which selects the container image -// mcr.microsoft.com/aks/aks-gpu-<type>. Modern CUDA compute SKUs (T4, V100, A100, H100, H200, ...) +// mcr.microsoft.com/aks/aks-gpu-<type>. All CUDA compute SKUs (T4, V100, A100, H100, H200, ...) // use the R580 LTS image (aks-gpu-cuda-lts): it retains Volta/V100 support that the newer aks-gpu-cuda // R595 line drops, is supported through Aug 2028, and is the branch the VHD driver prebake is built -// against. Legacy NCv1 (K80) keeps the separate "cuda" path with its pinned R470 driver. +// against. +// +// Legacy NCv1 (K80, Kepler) is intentionally NOT special-cased. NVIDIA R470 was the last branch to +// support Kepler data-center GPUs, and AKS never published an aks-gpu-cuda R470 image (that tag always +// 404'd), so no managed driver can actually run a K80. NCv1 therefore falls through to the default +// cuda-lts path like any other compute SKU. The hardware is EOL and effectively unused on AKS. 
func GetGPUDriverType(size string) string { if useGridV20Drivers(size) { return "grid-v20" @@ -1538,9 +1535,6 @@ func GetGPUDriverType(size string) string { if useGridDrivers(size) { return "grid" } - if isStandardNCv1(size) { - return "cuda" - } return "cuda-lts" } diff --git a/pkg/agent/baker_test.go b/pkg/agent/baker_test.go index 5478db71632..8090ed82e52 100644 --- a/pkg/agent/baker_test.go +++ b/pkg/agent/baker_test.go @@ -939,8 +939,11 @@ var _ = Describe("Test normalizeResourceGroupNameForLabel", func() { }) var _ = Describe("GetGPUDriverVersion", func() { - It("should use 470 with nc v1", func() { - Expect(GetGPUDriverVersion("standard_nc6")).To(Equal(datamodel.Nvidia470CudaDriverVersion)) + // NCv1 (K80, Kepler) is EOL: R470 was the last branch to support it and AKS never published an + // R470 image, so it is no longer special-cased and falls through to the default CUDA (cuda-lts) + // version like any other compute SKU. + It("should fall through to cuda-lts version for legacy nc v1 (K80)", func() { + Expect(GetGPUDriverVersion("standard_nc6")).To(Equal(datamodel.NvidiaCudaDriverVersion)) }) It("should use cuda with nc v3", func() { Expect(GetGPUDriverVersion("standard_nc6_v3")).To(Equal(datamodel.NvidiaCudaDriverVersion)) @@ -967,8 +970,9 @@ var _ = Describe("GetGPUDriverType", func() { It("should use cuda-lts with nc v3", func() { Expect(GetGPUDriverType("standard_nc6_v3")).To(Equal("cuda-lts")) }) - It("should keep cuda (legacy R470) with nc v1 (K80)", func() { - Expect(GetGPUDriverType("standard_nc6")).To(Equal("cuda")) + // NCv1 (K80, Kepler) is EOL and no longer special-cased; it falls through to cuda-lts. 
+ It("should fall through to cuda-lts for legacy nc v1 (K80)", func() { + Expect(GetGPUDriverType("standard_nc6")).To(Equal("cuda-lts")) }) It("should use grid with nv v5", func() { Expect(GetGPUDriverType("standard_nv6ads_a10_v5")).To(Equal("grid")) diff --git a/pkg/agent/datamodel/gpu_components.go b/pkg/agent/datamodel/gpu_components.go index e2c8b33090d..cc6e263dd63 100644 --- a/pkg/agent/datamodel/gpu_components.go +++ b/pkg/agent/datamodel/gpu_components.go @@ -8,8 +8,6 @@ import ( "github.com/Azure/agentbaker/parts" ) -const Nvidia470CudaDriverVersion = "cuda-470.82.01" - //nolint:gochecknoglobals var ( NvidiaCudaDriverVersion string