Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions parts/common/components.json
Original file line number Diff line number Diff line change
Expand Up @@ -736,10 +736,10 @@
],
"GPUContainerImages": [
{
"downloadURL": "mcr.microsoft.com/aks/aks-gpu-cuda:*",
"downloadURL": "mcr.microsoft.com/aks/aks-gpu-cuda-lts:*",
"gpuVersion": {
"renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda",
"latestVersion": "580.126.09-20260126030251"
"renovateTag": "registry=https://mcr.microsoft.com, name=aks/aks-gpu-cuda-lts",
"latestVersion": "580.159.04-20260629214430"
Comment thread
ganeshkumarashok marked this conversation as resolved.
}
},
{
Expand Down
3 changes: 2 additions & 1 deletion parts/linux/cloud-init/artifacts/acl/cse_install_acl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ installGPUDriverSysext() {
#
# NVIDIA_GPU_DRIVER_TYPE is set by AgentBaker based on the GPU SKU maps in
# gpu_components.go. Converged sizes get "grid"; RTX PRO 6000 BSE v6 gets
# "grid-v20" (Ubuntu-only, rejected below); all others get "cuda".
# "grid-v20" (Ubuntu-only, rejected below); modern CUDA SKUs get "cuda-lts" and legacy
# NCv1 gets "cuda". Only grid vs non-grid matters here, so both take the CUDA path below.
# Legacy GPUs (T4, V100) require proprietary CUDA drivers; A100+ use NVIDIA open drivers.
local vm_sku
vm_sku=$(get_compute_sku)
Expand Down
11 changes: 9 additions & 2 deletions parts/linux/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1301,13 +1301,20 @@ validateGPUDrivers() {
# before enabling consume. Observability only; no behavior change.
logGPUDriverPrebakeReadiness() {
local marker="${GPU_DKMS_MARKER_FILE:-/opt/azure/aks-gpu/dkms-marker}"
local marker_present=false driver_kind_match=false m_kind
local marker_present=false driver_kind_match=false m_kind node_kind
# Map the AgentBaker driver-type to the aks-gpu marker's driver_kind (the container's DRIVER_KIND
# build arg): image variants "cuda-lts" and "grid-v20" bake markers as "cuda"/"grid" respectively.
case "${NVIDIA_GPU_DRIVER_TYPE}" in
cuda*) node_kind=cuda ;;
grid*) node_kind=grid ;;
*) node_kind="${NVIDIA_GPU_DRIVER_TYPE}" ;;
esac
if [ -f "${marker}" ]; then
marker_present=true
m_kind="$(sed -n 's/^driver_kind=//p' "${marker}" | head -n1)"
# require both sides non-empty so a marker missing driver_kind= (or an unset
# NVIDIA_GPU_DRIVER_TYPE) does not falsely report a match (empty = empty).
if [ -n "${m_kind}" ] && [ -n "${NVIDIA_GPU_DRIVER_TYPE}" ] && [ "${m_kind}" = "${NVIDIA_GPU_DRIVER_TYPE}" ]; then
if [ -n "${m_kind}" ] && [ -n "${node_kind}" ] && [ "${m_kind}" = "${node_kind}" ]; then
driver_kind_match=true
fi
fi
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ downloadGPUDrivers() {
#
# NVIDIA_GPU_DRIVER_TYPE is set by AgentBaker based on the GPU SKU maps in
# gpu_components.go. Converged sizes get "grid"; RTX PRO 6000 BSE v6 gets
# "grid-v20" (Ubuntu-only, rejected below); all others get "cuda".
# "grid-v20" (Ubuntu-only, rejected below); modern CUDA SKUs get "cuda-lts" and legacy
# NCv1 gets "cuda". Only grid vs non-grid matters here, so both take the CUDA path below.
# Legacy GPUs (T4, V100) require proprietary CUDA drivers; A100+ use NVIDIA open drivers.
KERNEL_VERSION=$(uname -r | sed 's/-/./g')
VM_SKU=$(get_compute_sku)
Expand Down
11 changes: 10 additions & 1 deletion pkg/agent/baker.go
Original file line number Diff line number Diff line change
Expand Up @@ -1525,14 +1525,23 @@ func GetAKSGPUImageSHA(size string) string {
return datamodel.AKSGPUCudaVersionSuffix
}

// GetGPUDriverType maps a GPU VM size to the aks-gpu image variant used to install its driver.
// The value becomes NVIDIA_GPU_DRIVER_TYPE at provision time, which selects the container image
// mcr.microsoft.com/aks/aks-gpu-<type>. Modern CUDA compute SKUs (T4, V100, A100, H100, H200, ...)
// use the R580 LTS image (aks-gpu-cuda-lts): it retains Volta/V100 support that the newer aks-gpu-cuda
// R595 line drops, is supported through Aug 2028, and is the branch the VHD driver prebake is built
// against. Legacy NCv1 (K80) keeps the separate "cuda" path with its pinned R470 driver.
func GetGPUDriverType(size string) string {
Comment thread
ganeshkumarashok marked this conversation as resolved.
if useGridV20Drivers(size) {
return "grid-v20"
}
if useGridDrivers(size) {
return "grid"
}
return "cuda"
if isStandardNCv1(size) {
return "cuda"
}
return "cuda-lts"
Comment thread
ganeshkumarashok marked this conversation as resolved.
}

func GPUNeedsFabricManager(size string) bool {
Expand Down
13 changes: 8 additions & 5 deletions pkg/agent/baker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -964,8 +964,11 @@ var _ = Describe("GetGPUDriverVersion", func() {

var _ = Describe("GetGPUDriverType", func() {

It("should use cuda with nc v3", func() {
Expect(GetGPUDriverType("standard_nc6_v3")).To(Equal("cuda"))
It("should use cuda-lts with nc v3", func() {
Expect(GetGPUDriverType("standard_nc6_v3")).To(Equal("cuda-lts"))
})
It("should keep cuda (legacy R470) with nc v1 (K80)", func() {
Expect(GetGPUDriverType("standard_nc6")).To(Equal("cuda"))
})
It("should use grid with nv v5", func() {
Expect(GetGPUDriverType("standard_nv6ads_a10_v5")).To(Equal("grid"))
Expand All @@ -979,8 +982,8 @@ var _ = Describe("GetGPUDriverType", func() {
Expect(GetGPUDriverType("Standard_NC320lds_xl_RTXPRO6000BSE_v6")).To(Equal("grid-v20"))
})
// NV V1 SKUs were retired in September 2023, leaving this test just for safety
It("should use cuda with nv v1", func() {
Expect(GetGPUDriverType("standard_nv6")).To(Equal("cuda"))
It("should use cuda-lts with nv v1", func() {
Expect(GetGPUDriverType("standard_nv6")).To(Equal("cuda-lts"))
})
})

Expand Down Expand Up @@ -1118,7 +1121,7 @@ var _ = Describe("getLinuxNodeCSECommand", func() {
vars := decodeCSEVars(cseCmd)
Expect(vars).To(HaveKeyWithValue("GPU_NODE", "true"))
Expect(vars).To(HaveKeyWithValue("CONFIG_GPU_DRIVER_IF_NEEDED", "true"))
Expect(vars).To(HaveKeyWithValue("GPU_DRIVER_TYPE", "cuda"))
Expect(vars).To(HaveKeyWithValue("GPU_DRIVER_TYPE", "cuda-lts"))
})

It("should handle custom cloud environment", func() {
Expand Down
2 changes: 1 addition & 1 deletion pkg/agent/datamodel/gpu_components.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func LoadConfig() error {
// repos sharing a prefix (e.g. "aks-gpu-grid" vs "aks-gpu-grid-v20") are not
// confused by substring matching.
switch gpuImageRepo(image.DownloadURL) {
case "aks-gpu-cuda":
case "aks-gpu-cuda-lts":
NvidiaCudaDriverVersion = version
AKSGPUCudaVersionSuffix = suffix
case "aks-gpu-grid":
Expand Down
2 changes: 1 addition & 1 deletion pkg/agent/datamodel/gpu_components_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ func TestLoadConfig(t *testing.T) {
// LoadConfig switch that maps each repo to its own driver version/suffix.
func TestGPUImageRepo(t *testing.T) {
cases := map[string]string{
"mcr.microsoft.com/aks/aks-gpu-cuda:*": "aks-gpu-cuda",
"mcr.microsoft.com/aks/aks-gpu-cuda-lts:*": "aks-gpu-cuda-lts",
"mcr.microsoft.com/aks/aks-gpu-grid:*": "aks-gpu-grid",
"mcr.microsoft.com/aks/aks-gpu-grid-v20:*": "aks-gpu-grid-v20",
"mcr.microsoft.com/aks/aks-gpu-grid-v20:595.58.03-1": "aks-gpu-grid-v20",
Expand Down
12 changes: 12 additions & 0 deletions spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,18 @@ Describe 'cse_config.sh'
rm -f "$marker"
End

It 'matches a cuda marker for a cuda-lts (R580 LTS) node: driver-type maps to the aks-gpu driver_kind'
marker="$(mktemp)"
printf 'driver_kind=cuda\n' > "$marker"
GPU_DKMS_MARKER_FILE="$marker"
NVIDIA_GPU_DRIVER_TYPE="cuda-lts"
When call logGPUDriverPrebakeReadiness
The output should include "marker_present=true"
The output should include "driver_kind_match=true"
The output should include "driver_type=cuda-lts"
rm -f "$marker"
End

It 'reports driver_kind_match=false when a CUDA marker is on a GRID node (not skip-ready)'
marker="$(mktemp)"
printf 'driver_kind=cuda\n' > "$marker"
Expand Down
4 changes: 2 additions & 2 deletions vhdbuilder/packer/install-dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -716,7 +716,7 @@ if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then # No ARM64 SKU wit
# shellcheck disable=SC2001
imageName=$(echo "$downloadURL" | sed 's/:.*$//')

if [ "$imageName" = "mcr.microsoft.com/aks/aks-gpu-cuda" ]; then
if [ "$imageName" = "mcr.microsoft.com/aks/aks-gpu-cuda-lts" ]; then
latestVersion=$(echo "${imageToBePulled}" | jq -r '.gpuVersion.latestVersion')
NVIDIA_DRIVER_IMAGE="$imageName"
NVIDIA_DRIVER_IMAGE_TAG="$latestVersion"
Expand All @@ -726,7 +726,7 @@ if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then # No ARM64 SKU wit

# Check if the NVIDIA_DRIVER_IMAGE and NVIDIA_DRIVER_IMAGE_TAG were found
if [ -z "$NVIDIA_DRIVER_IMAGE" ] || [ -z "$NVIDIA_DRIVER_IMAGE_TAG" ]; then
echo "Error: Unable to find aks-gpu-cuda image in components.json"
echo "Error: Unable to find aks-gpu-cuda-lts image in components.json"
exit 1
fi

Expand Down
Loading