Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions parts/linux/cloud-init/artifacts/acl/cse_install_acl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,9 @@ installGPUDriverSysext() {
#
# NVIDIA_GPU_DRIVER_TYPE is set by AgentBaker based on the GPU SKU maps in
# gpu_components.go. Converged sizes get "grid"; RTX PRO 6000 BSE v6 gets
# "grid-v20" (Ubuntu-only, rejected below); modern CUDA SKUs get "cuda-lts" and legacy
# NCv1 gets "cuda". Only grid vs non-grid matters here, so both take the CUDA path below.
# "grid-v20" (Ubuntu-only, rejected below); all other CUDA SKUs (including legacy
# NCv1/K80) get "cuda-lts". Only grid vs non-grid matters here, so the CUDA path
# below handles every non-grid SKU.
# Legacy GPUs (T4, V100) require proprietary CUDA drivers; A100+ use NVIDIA open drivers.
local vm_sku
vm_sku=$(get_compute_sku)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,9 @@ downloadGPUDrivers() {
#
# NVIDIA_GPU_DRIVER_TYPE is set by AgentBaker based on the GPU SKU maps in
# gpu_components.go. Converged sizes get "grid"; RTX PRO 6000 BSE v6 gets
# "grid-v20" (Ubuntu-only, rejected below); modern CUDA SKUs get "cuda-lts" and legacy
# NCv1 gets "cuda". Only grid vs non-grid matters here, so both take the CUDA path below.
# "grid-v20" (Ubuntu-only, rejected below); all other CUDA SKUs (including legacy
# NCv1/K80) get "cuda-lts". Only grid vs non-grid matters here, so the CUDA path
# below handles every non-grid SKU.
# Legacy GPUs (T4, V100) require proprietary CUDA drivers; A100+ use NVIDIA open drivers.
KERNEL_VERSION=$(uname -r | sed 's/-/./g')
VM_SKU=$(get_compute_sku)
Expand Down
20 changes: 7 additions & 13 deletions pkg/agent/baker.go
Original file line number Diff line number Diff line change
Expand Up @@ -1494,17 +1494,9 @@ func GetGPUDriverVersion(size string) string {
if useGridDrivers(size) {
return datamodel.NvidiaGridDriverVersion
}
if isStandardNCv1(size) {
return datamodel.Nvidia470CudaDriverVersion
}
return datamodel.NvidiaCudaDriverVersion
}

// isStandardNCv1 reports whether size, compared case-insensitively, starts with
// "standard_nc" and contains no "_v" substring anywhere in the name.
func isStandardNCv1(size string) bool {
	lowered := strings.ToLower(size)
	if !strings.HasPrefix(lowered, "standard_nc") {
		return false
	}
	// A "_v" substring anywhere in the lowered name disqualifies the size.
	return !strings.Contains(lowered, "_v")
}

// useGridDrivers returns the datamodel.ConvergedGPUDriverSizes entry for size.
// The lookup key is the lower-cased size, so callers may pass any casing.
func useGridDrivers(size string) bool {
	key := strings.ToLower(size)
	return datamodel.ConvergedGPUDriverSizes[key]
}
Expand All @@ -1527,20 +1519,22 @@ func GetAKSGPUImageSHA(size string) string {

// GetGPUDriverType maps a GPU VM size to the aks-gpu image variant used to install its driver.
// The value becomes NVIDIA_GPU_DRIVER_TYPE at provision time, which selects the container image
// mcr.microsoft.com/aks/aks-gpu-<type>. Modern CUDA compute SKUs (T4, V100, A100, H100, H200, ...)
// mcr.microsoft.com/aks/aks-gpu-<type>. All CUDA compute SKUs (T4, V100, A100, H100, H200, ...)
// use the R580 LTS image (aks-gpu-cuda-lts): it retains Volta/V100 support that the newer aks-gpu-cuda
// R595 line drops, is supported through Aug 2028, and is the branch the VHD driver prebake is built
// against. Legacy NCv1 (K80) keeps the separate "cuda" path with its pinned R470 driver.
// against.
//
// Legacy NCv1 (K80, Kepler) is intentionally NOT special-cased. NVIDIA R470 was the last branch to
// support Kepler data-center GPUs, and AKS never published an aks-gpu-cuda R470 image (that tag always
// 404'd), so no managed driver can actually run a K80. NCv1 therefore falls through to the default
// cuda-lts path like any other compute SKU. The hardware is EOL and effectively unused on AKS.
func GetGPUDriverType(size string) string {
	// Cases are evaluated top to bottom and the first match wins, so a size
	// that satisfies several predicates gets the earliest listed type.
	switch {
	case useGridV20Drivers(size):
		return "grid-v20"
	case useGridDrivers(size):
		return "grid"
	case isStandardNCv1(size):
		// NOTE(review): the doc comment on this function says NCv1 is not
		// special-cased, yet this case still returns "cuda" — confirm which
		// of the two is intended.
		return "cuda"
	default:
		// Every size not matched above uses the cuda-lts variant.
		return "cuda-lts"
	}
}

Expand Down
12 changes: 8 additions & 4 deletions pkg/agent/baker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -939,8 +939,11 @@ var _ = Describe("Test normalizeResourceGroupNameForLabel", func() {
})

var _ = Describe("GetGPUDriverVersion", func() {
It("should use 470 with nc v1", func() {
Expect(GetGPUDriverVersion("standard_nc6")).To(Equal(datamodel.Nvidia470CudaDriverVersion))
// NCv1 (K80, Kepler) is EOL: R470 was the last branch to support it and AKS never published an
// R470 image, so it is no longer special-cased and falls through to the default CUDA (cuda-lts)
// version like any other compute SKU.
It("should fall through to cuda-lts version for legacy nc v1 (K80)", func() {
Expect(GetGPUDriverVersion("standard_nc6")).To(Equal(datamodel.NvidiaCudaDriverVersion))
})
It("should use cuda with nc v3", func() {
Expect(GetGPUDriverVersion("standard_nc6_v3")).To(Equal(datamodel.NvidiaCudaDriverVersion))
Expand All @@ -967,8 +970,9 @@ var _ = Describe("GetGPUDriverType", func() {
It("should use cuda-lts with nc v3", func() {
Expect(GetGPUDriverType("standard_nc6_v3")).To(Equal("cuda-lts"))
})
It("should keep cuda (legacy R470) with nc v1 (K80)", func() {
Expect(GetGPUDriverType("standard_nc6")).To(Equal("cuda"))
// NCv1 (K80, Kepler) is EOL and no longer special-cased; it falls through to cuda-lts.
It("should fall through to cuda-lts for legacy nc v1 (K80)", func() {
Expect(GetGPUDriverType("standard_nc6")).To(Equal("cuda-lts"))
})
It("should use grid with nv v5", func() {
Expect(GetGPUDriverType("standard_nv6ads_a10_v5")).To(Equal("grid"))
Expand Down
2 changes: 0 additions & 2 deletions pkg/agent/datamodel/gpu_components.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ import (
"github.com/Azure/agentbaker/parts"
)

const Nvidia470CudaDriverVersion = "cuda-470.82.01"

//nolint:gochecknoglobals
var (
NvidiaCudaDriverVersion string
Expand Down
Loading