Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/renovate.json
Original file line number Diff line number Diff line change
Expand Up @@ -572,11 +572,11 @@
},
{
"matchPackageNames": [
"aks/aks-gpu-cuda"
"aks/aks-gpu-cuda-lts"
],
"groupName": "nvidia-gpu-cuda",
"groupName": "nvidia-gpu-cuda-lts",
"versioning": "regex:^(?<major>\\d+)\\.(?<minor>\\d+)\\.(?<patch>\\d+)-(?<prerelease>\\d{14})$",
"automerge": true,
"automerge": false,
"enabled": true,
"ignoreUnstable": false
},
Expand Down
6 changes: 3 additions & 3 deletions parts/common/components.json
Original file line number Diff line number Diff line change
Expand Up @@ -736,10 +736,10 @@
],
"GPUContainerImages": [
{
"downloadURL": "mcr.microsoft.com/aks/aks-gpu-cuda:*",
"downloadURL": "mcr.microsoft.com/aks/aks-gpu-cuda-lts:*",
"gpuVersion": {
"renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda",
"latestVersion": "580.126.09-20260126030251"
"renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda-lts",
"latestVersion": "580.159.04-20260629214430"
Comment thread
ganeshkumarashok marked this conversation as resolved.
}
},
{
Expand Down
3 changes: 2 additions & 1 deletion parts/linux/cloud-init/artifacts/acl/cse_install_acl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ installGPUDriverSysext() {
#
# NVIDIA_GPU_DRIVER_TYPE is set by AgentBaker based on the GPU SKU maps in
# gpu_components.go. Converged sizes get "grid"; RTX PRO 6000 BSE v6 gets
# "grid-v20" (Ubuntu-only, rejected below); all others get "cuda".
# "grid-v20" (Ubuntu-only, rejected below); NCv1 (K80) gets "cuda" and every other CUDA
# SKU gets "cuda-lts". Only grid vs non-grid matters here, so "cuda" and "cuda-lts" both
# take the CUDA path below.
# Legacy GPUs (T4, V100) require proprietary CUDA drivers; A100+ use NVIDIA open drivers.
local vm_sku
vm_sku=$(get_compute_sku)
Expand Down
11 changes: 9 additions & 2 deletions parts/linux/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1301,13 +1301,20 @@ validateGPUDrivers() {
# before enabling consume. Observability only; no behavior change.
logGPUDriverPrebakeReadiness() {
local marker="${GPU_DKMS_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}"
local marker_present=false driver_kind_match=false m_kind
local marker_present=false driver_kind_match=false m_kind node_kind
# Map the AgentBaker driver-type to the aks-gpu marker's driver_kind (the container's DRIVER_KIND
# build arg): image variants "cuda-lts" and "grid-v20" bake markers as "cuda"/"grid" respectively.
case "${NVIDIA_GPU_DRIVER_TYPE}" in
cuda*) node_kind=cuda ;;
grid*) node_kind=grid ;;
*) node_kind="${NVIDIA_GPU_DRIVER_TYPE}" ;;
esac
if [ -f "${marker}" ]; then
marker_present=true
m_kind="$(sed -n 's/^driver_kind=//p' "${marker}" | head -n1)"
# require both sides non-empty so a marker missing driver_kind= (or an unset
# NVIDIA_GPU_DRIVER_TYPE) does not falsely report a match (empty = empty).
if [ -n "${m_kind}" ] && [ -n "${NVIDIA_GPU_DRIVER_TYPE}" ] && [ "${m_kind}" = "${NVIDIA_GPU_DRIVER_TYPE}" ]; then
if [ -n "${m_kind}" ] && [ -n "${node_kind}" ] && [ "${m_kind}" = "${node_kind}" ]; then
driver_kind_match=true
fi
fi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ downloadGPUDrivers() {
#
# NVIDIA_GPU_DRIVER_TYPE is set by AgentBaker based on the GPU SKU maps in
# gpu_components.go. Converged sizes get "grid"; RTX PRO 6000 BSE v6 gets
# "grid-v20" (Ubuntu-only, rejected below); all others get "cuda".
# "grid-v20" (Ubuntu-only, rejected below); NCv1 (K80) gets "cuda" and every other CUDA
# SKU gets "cuda-lts". Only grid vs non-grid matters here, so "cuda" and "cuda-lts" both
# take the CUDA path below.
# Legacy GPUs (T4, V100) require proprietary CUDA drivers; A100+ use NVIDIA open drivers.
KERNEL_VERSION=$(uname -r | sed 's/-/./g')
VM_SKU=$(get_compute_sku)
Expand Down
11 changes: 10 additions & 1 deletion pkg/agent/baker.go
Original file line number Diff line number Diff line change
Expand Up @@ -1525,14 +1525,23 @@ func GetAKSGPUImageSHA(size string) string {
return datamodel.AKSGPUCudaVersionSuffix
}

// GetGPUDriverType maps a GPU VM size to the aks-gpu image variant used to install its driver.
// The value becomes NVIDIA_GPU_DRIVER_TYPE at provision time, which selects the container image
// mcr.microsoft.com/aks/aks-gpu-<type>. Modern CUDA compute SKUs (T4, V100, A100, H100, H200, ...)
// use the R580 LTS image (aks-gpu-cuda-lts): it retains Volta/V100 support that the newer aks-gpu-cuda
// R595 line drops, is supported through Aug 2028, and is the branch the VHD driver prebake is built
// against. Legacy NCv1 (K80) keeps the separate "cuda" path with its pinned R470 driver.
func GetGPUDriverType(size string) string {
Comment thread
ganeshkumarashok marked this conversation as resolved.
if useGridV20Drivers(size) {
return "grid-v20"
}
if useGridDrivers(size) {
return "grid"
}
return "cuda"
if isStandardNCv1(size) {
return "cuda"
}
return "cuda-lts"
Comment thread
ganeshkumarashok marked this conversation as resolved.
}

func GPUNeedsFabricManager(size string) bool {
Expand Down
13 changes: 8 additions & 5 deletions pkg/agent/baker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -964,8 +964,11 @@ var _ = Describe("GetGPUDriverVersion", func() {

var _ = Describe("GetGPUDriverType", func() {

It("should use cuda with nc v3", func() {
Expect(GetGPUDriverType("standard_nc6_v3")).To(Equal("cuda"))
It("should use cuda-lts with nc v3", func() {
Expect(GetGPUDriverType("standard_nc6_v3")).To(Equal("cuda-lts"))
})
It("should keep cuda (legacy R470) with nc v1 (K80)", func() {
Expect(GetGPUDriverType("standard_nc6")).To(Equal("cuda"))
})
It("should use grid with nv v5", func() {
Expect(GetGPUDriverType("standard_nv6ads_a10_v5")).To(Equal("grid"))
Expand All @@ -979,8 +982,8 @@ var _ = Describe("GetGPUDriverType", func() {
Expect(GetGPUDriverType("Standard_NC320lds_xl_RTXPRO6000BSE_v6")).To(Equal("grid-v20"))
})
// NV V1 SKUs were retired in September 2023, leaving this test just for safety
It("should use cuda with nv v1", func() {
Expect(GetGPUDriverType("standard_nv6")).To(Equal("cuda"))
It("should use cuda-lts with nv v1", func() {
Expect(GetGPUDriverType("standard_nv6")).To(Equal("cuda-lts"))
})
})

Expand Down Expand Up @@ -1118,7 +1121,7 @@ var _ = Describe("getLinuxNodeCSECommand", func() {
vars := decodeCSEVars(cseCmd)
Expect(vars).To(HaveKeyWithValue("GPU_NODE", "true"))
Expect(vars).To(HaveKeyWithValue("CONFIG_GPU_DRIVER_IF_NEEDED", "true"))
Expect(vars).To(HaveKeyWithValue("GPU_DRIVER_TYPE", "cuda"))
Expect(vars).To(HaveKeyWithValue("GPU_DRIVER_TYPE", "cuda-lts"))
})

It("should handle custom cloud environment", func() {
Expand Down
2 changes: 1 addition & 1 deletion pkg/agent/datamodel/gpu_components.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func LoadConfig() error {
// repos sharing a prefix (e.g. "aks-gpu-grid" vs "aks-gpu-grid-v20") are not
// confused by substring matching.
switch gpuImageRepo(image.DownloadURL) {
case "aks-gpu-cuda":
case "aks-gpu-cuda-lts":
NvidiaCudaDriverVersion = version
AKSGPUCudaVersionSuffix = suffix
case "aks-gpu-grid":
Expand Down
2 changes: 1 addition & 1 deletion pkg/agent/datamodel/gpu_components_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ func TestLoadConfig(t *testing.T) {
// LoadConfig switch that maps each repo to its own driver version/suffix.
func TestGPUImageRepo(t *testing.T) {
cases := map[string]string{
"mcr.microsoft.com/aks/aks-gpu-cuda:*": "aks-gpu-cuda",
"mcr.microsoft.com/aks/aks-gpu-cuda-lts:*": "aks-gpu-cuda-lts",
"mcr.microsoft.com/aks/aks-gpu-grid:*": "aks-gpu-grid",
"mcr.microsoft.com/aks/aks-gpu-grid-v20:*": "aks-gpu-grid-v20",
"mcr.microsoft.com/aks/aks-gpu-grid-v20:595.58.03-1": "aks-gpu-grid-v20",
Expand Down
12 changes: 12 additions & 0 deletions spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,18 @@ Describe 'cse_config.sh'
rm -f "$marker"
End

# Checks the driver-type -> driver_kind mapping in logGPUDriverPrebakeReadiness: a marker that
# records driver_kind=cuda must be reported as a match when NVIDIA_GPU_DRIVER_TYPE is "cuda-lts".
It 'matches a cuda marker for a cuda-lts (R580 LTS) node: driver-type maps to the aks-gpu driver_kind'
# Throwaway marker file containing only the driver_kind= line that the function extracts.
marker="$(mktemp)"
printf 'driver_kind=cuda\n' > "$marker"
# Point the function at the temp marker instead of its default marker path.
GPU_DKMS_MARKER_FILE="$marker"
NVIDIA_GPU_DRIVER_TYPE="cuda-lts"
When call logGPUDriverPrebakeReadiness
The output should include "marker_present=true"
The output should include "driver_kind_match=true"
# The logged driver_type is expected to remain the unmapped value ("cuda-lts"), not "cuda".
The output should include "driver_type=cuda-lts"
# Remove the temp marker so it does not leak into later examples.
rm -f "$marker"
End

It 'reports driver_kind_match=false when a CUDA marker is on a GRID node (not skip-ready)'
marker="$(mktemp)"
printf 'driver_kind=cuda\n' > "$marker"
Expand Down
4 changes: 2 additions & 2 deletions vhdbuilder/packer/install-dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -716,7 +716,7 @@ if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then # No ARM64 SKU wit
# shellcheck disable=SC2001
imageName=$(echo "$downloadURL" | sed 's/:.*$//')

if [ "$imageName" = "mcr.microsoft.com/aks/aks-gpu-cuda" ]; then
if [ "$imageName" = "mcr.microsoft.com/aks/aks-gpu-cuda-lts" ]; then
latestVersion=$(echo "${imageToBePulled}" | jq -r '.gpuVersion.latestVersion')
NVIDIA_DRIVER_IMAGE="$imageName"
NVIDIA_DRIVER_IMAGE_TAG="$latestVersion"
Expand All @@ -726,7 +726,7 @@ if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then # No ARM64 SKU wit

# Check if the NVIDIA_DRIVER_IMAGE and NVIDIA_DRIVER_IMAGE_TAG were found
if [ -z "$NVIDIA_DRIVER_IMAGE" ] || [ -z "$NVIDIA_DRIVER_IMAGE_TAG" ]; then
echo "Error: Unable to find aks-gpu-cuda image in components.json"
echo "Error: Unable to find aks-gpu-cuda-lts image in components.json"
exit 1
fi

Expand Down
Loading