Skip to content
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions e2e/scenario_gpu_managed_experience_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -783,3 +783,41 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning_MIG_Mixed(t *testing.T) {
},
})
}

func Test_Ubuntu2404_DraDriverNvidiaGpuRunning(t *testing.T) {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this not supported on AZL?

RunScenario(t, &Scenario{
Description: "Tests DRA driver works on Ubuntu 24.04 VHD with containerd v2",
Tags: Tags{
GPU: true,
},

Config: Config{
Cluster: ClusterKubenet,
SkipScriptlessNBC: true,
VHD: config.VHDUbuntu2404Gen2Containerd,
BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) {
nbc.AgentPoolProfile.VMSize = "Standard_NV6ads_A10_v5"
nbc.ConfigGPUDriverIfNeeded = true
nbc.EnableNvidia = true
nbc.EnableManagedGPUDRA = true
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
vmss.SKU.Name = to.Ptr("Standard_NV6ads_A10_v5")

// Enable the AKS VM extension for GPU nodes
extension, err := createVMExtensionLinuxAKSNode(t.Context(), vmss.Location)
require.NoError(t, err, "creating AKS VM extension")
vmss.Properties = addVMExtensionToVMSS(vmss.Properties, extension)
},
Validator: func(ctx context.Context, s *Scenario) {
containerdVersions := components.GetExpectedPackageVersions("containerd", "ubuntu", "r2404")
runcVersions := components.GetExpectedPackageVersions("runc", "ubuntu", "r2404")
ValidateContainerd2Properties(ctx, s, containerdVersions)
ValidateRuncVersion(ctx, s, runcVersions)
ValidateContainerRuntimePlugins(ctx, s)
ValidateDraDriverNvidiaGpuServiceRunning(ctx, s)
ValidateDRAWorkloadSchedulable(ctx, s)
},
},
})
}
96 changes: 96 additions & 0 deletions e2e/validators.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ import (
"github.com/stretchr/testify/require"
certv1 "k8s.io/api/certificates/v1"
corev1 "k8s.io/api/core/v1"
resourcev1 "k8s.io/api/resource/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
Expand Down Expand Up @@ -3111,3 +3113,97 @@ func ValidateSecondaryNICDualStack(ctx context.Context, s *Scenario, ifaceName s
require.Contains(s.T, result.stdout, "scope global",
"expected interface %s to have a global IPv6 address (not just link-local), got:\n%s", ifaceName, result.stdout)
}

// ValidateDraDriverNvidiaGpuServiceRunning probes the dra-driver-nvidia-gpu
// systemd unit on the scenario VM with "systemctl is-active" and
// "systemctl is-enabled".
//
// Both probes are sent as a single script that starts with "set -ex", and 0 is
// passed to the exec helper as the expected exit code.
func ValidateDraDriverNvidiaGpuServiceRunning(ctx context.Context, s *Scenario) {
	s.T.Helper()
	s.T.Logf("validating DRA driver NVIDIA GPU systemd service is running")

	const unit = "dra-driver-nvidia-gpu.service"
	script := "set -ex\n" +
		"systemctl is-active " + unit + "\n" +
		"systemctl is-enabled " + unit
	execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "DRA driver NVIDIA GPU systemd service should be active and enabled")
}

// ValidateDRAWorkloadSchedulable verifies that a pod consuming a GPU through
// Dynamic Resource Allocation (DRA) can run on the scenario's node.
//
// It creates a DeviceClass and a ResourceClaim whose names are derived from the
// node name (an AlreadyExists error is tolerated for both), then hands
// ValidatePodRunning a pod that references the claim and is pinned to the node
// via the kubernetes.io/hostname node selector.
//
// The test is failed through s.T if ctx ends during the initial settle delay or
// if either object cannot be created.
func ValidateDRAWorkloadSchedulable(ctx context.Context, s *Scenario) {
	s.T.Helper()
	s.T.Logf("validating that DRA workloads can be scheduled")

	// Same delay as existing GPU tests, but abort as soon as ctx ends instead of
	// sleeping through a cancellation or deadline.
	settle := time.NewTimer(20 * time.Second)
	defer settle.Stop()
	select {
	case <-settle.C:
	case <-ctx.Done():
		require.NoError(s.T, ctx.Err(), "context ended while waiting before DRA workload validation")
	}

	// Derive per-node object names: lower-case the node name, cap it at 40
	// characters, and drop any trailing '-' left behind by the truncation.
	baseName := strings.ToLower(s.Runtime.VM.KubeName)
	if len(baseName) > 40 {
		baseName = baseName[:40]
	}
	baseName = strings.TrimRight(baseName, "-")
	deviceClassName := fmt.Sprintf("gpu-nvidia-%s", baseName)
	claimName := fmt.Sprintf("single-gpu-%s", baseName)
	podClaimRefName := "gpu-claim"

	// DeviceClass with an empty spec; an already-existing object is accepted.
	_, err := s.Runtime.Kube.Typed.ResourceV1().DeviceClasses().Create(ctx, &resourcev1.DeviceClass{
		ObjectMeta: metav1.ObjectMeta{
			Name: deviceClassName,
		},
		Spec: resourcev1.DeviceClassSpec{},
	}, metav1.CreateOptions{})
	require.Truef(s.T, err == nil || apierrors.IsAlreadyExists(err), "failed to create DeviceClass %q: %v", deviceClassName, err)

	// ResourceClaim in the default namespace with a single request against the
	// DeviceClass created above.
	_, err = s.Runtime.Kube.Typed.ResourceV1().ResourceClaims("default").Create(ctx, &resourcev1.ResourceClaim{
		ObjectMeta: metav1.ObjectMeta{
			Name:      claimName,
			Namespace: "default",
		},
		Spec: resourcev1.ResourceClaimSpec{
			Devices: resourcev1.DeviceClaim{
				Requests: []resourcev1.DeviceRequest{
					{
						Name: "gpu",
						Exactly: &resourcev1.ExactDeviceRequest{
							DeviceClassName: deviceClassName,
						},
					},
				},
			},
		},
	}, metav1.CreateOptions{})
	require.Truef(s.T, err == nil || apierrors.IsAlreadyExists(err), "failed to create ResourceClaim %q: %v", claimName, err)

	// Create a DRA test pod that consumes the ResourceClaim.
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      fmt.Sprintf("%s-dra-test", s.Runtime.VM.KubeName),
			Namespace: "default",
		},
		Spec: corev1.PodSpec{
			// Pod-level reference to the claim; the container refers to it by
			// podClaimRefName below.
			ResourceClaims: []corev1.PodResourceClaim{
				{
					Name:              podClaimRefName,
					ResourceClaimName: &claimName,
				},
			},
			Containers: []corev1.Container{
				{
					Name:  "dra-test-container",
					Image: "mcr.microsoft.com/azuredocs/samples-tf-mnist-demo:gpu",
					Args: []string{
						"--max-steps", "1",
					},
					Resources: corev1.ResourceRequirements{
						Claims: []corev1.ResourceClaim{
							{
								Name: podClaimRefName,
							},
						},
					},
				},
			},
			// Pin the pod to the node under test.
			NodeSelector: map[string]string{
				"kubernetes.io/hostname": s.Runtime.VM.KubeName,
			},
		},
	}
	ValidatePodRunning(ctx, s, pod)

	s.T.Logf("GPU workload is schedulable and runs successfully")
}
1 change: 1 addition & 0 deletions parts/linux/cloud-init/artifacts/cse_cmd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ CONFIG_GPU_DRIVER_IF_NEEDED={{GetVariable "configGPUDriverIfNeeded"}}
ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED={{GetVariable "enableGPUDevicePluginIfNeeded"}}
MANAGED_GPU_EXPERIENCE_AFEC_ENABLED="{{IsManagedGPUExperienceAFECEnabled}}"
ENABLE_MANAGED_GPU="{{IsEnableManagedGPU}}"
ENABLE_MANAGED_GPU_DRA="{{IsEnableManagedGPUDRA}}"
NVIDIA_MIG_STRATEGY="{{GetMigStrategy}}"
CREDENTIAL_PROVIDER_DOWNLOAD_URL={{GetParameter "linuxCredentialProviderURL"}}
CONTAINERD_VERSION={{GetParameter "containerdVersion"}}
Expand Down
90 changes: 61 additions & 29 deletions parts/linux/cloud-init/artifacts/cse_config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1825,17 +1825,27 @@ configureManagedGPUExperience() {
if [ "${GPU_NODE}" != "true" ] || [ "${skip_nvidia_driver_install}" = "true" ]; then
return
fi
# RP validator ensures that only one of ENABLE_MANAGED_GPU_EXPERIENCE and ENABLE_MANAGED_GPU_EXPERIENCE_DRA is true at a time.
local managed_gpu_marker="/opt/azure/containers/managed-gpu-experience.enabled"
if [ "${ENABLE_MANAGED_GPU_EXPERIENCE}" = "true" ]; then
logs_to_events "AKS.CSE.installNvidiaManagedExpPkgFromCache" "installNvidiaManagedExpPkgFromCache" || exit $ERR_NVIDIA_DCGM_INSTALL
logs_to_events "AKS.CSE.startNvidiaManagedExpServices" "startNvidiaManagedExpServices" || exit $ERR_NVIDIA_DCGM_EXPORTER_FAIL
addKubeletNodeLabel "kubernetes.azure.com/dcgm-exporter=enabled"
mkdir -p "$(dirname "${managed_gpu_marker}")"
Comment thread
runzhen marked this conversation as resolved.
touch "${managed_gpu_marker}"
elif [ "${ENABLE_MANAGED_GPU_EXPERIENCE_DRA}" = "true" ]; then
logs_to_events "AKS.CSE.installNvidiaManagedExpPkgFromCache" "installNvidiaManagedExpPkgFromCache" || exit $ERR_NVIDIA_DCGM_INSTALL

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is all code duplicated from the previous `if` branch; please clean it up.

# defer startNvidiaManagedExpServices() after kubelet starts
addKubeletNodeLabel "kubernetes.azure.com/dcgm-exporter=enabled"
mkdir -p "$(dirname "${managed_gpu_marker}")"
Comment thread
runzhen marked this conversation as resolved.
touch "${managed_gpu_marker}"
else
# EnableManagedGPUExperience is mutable, so services may have been
# installed on a previous CSE run. Stop them if they exist.
# systemctlDisableAndStop checks whether the service exists before attempting to stop it,
# so this is safe to call even if the services were never installed.
logs_to_events "AKS.CSE.stop.nvidia-device-plugin" "systemctlDisableAndStop nvidia-device-plugin"
logs_to_events "AKS.CSE.stop.dra-driver-nvidia-gpu" "systemctlDisableAndStop dra-driver-nvidia-gpu"
logs_to_events "AKS.CSE.stop.nvidia-dcgm" "systemctlDisableAndStop nvidia-dcgm"
logs_to_events "AKS.CSE.stop.nvidia-dcgm-exporter" "systemctlDisableAndStop nvidia-dcgm-exporter"
rm -f "${managed_gpu_marker}"
Expand All @@ -1844,52 +1854,74 @@ configureManagedGPUExperience() {

startNvidiaManagedExpServices() {
# 1. Start the nvidia-device-plugin service.
# Create systemd override directory to configure device plugin
NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR="/etc/systemd/system/nvidia-device-plugin.service.d"
mkdir -p "${NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR}"

if [ "${MIG_NODE}" = "true" ]; then
# Configure with MIG strategy for MIG nodes.
# MIG strategy controls how nvidia-device-plugin exposes MIG instances to Kubernetes:
# - "single": All MIG devices exposed as generic nvidia.com/gpu resources
# - "mixed": MIG devices exposed with specific types like nvidia.com/mig-1g.5gb
#
# We only use "mixed" when explicitly specified via NVIDIA_MIG_STRATEGY.
# Otherwise, we default to "single" which is the safer/simpler option.
# Note: NVIDIA_MIG_STRATEGY values from RP are "None", "Single", "Mixed".
# "None" and "Single" both result in using the "single" strategy.
if [ "${NVIDIA_MIG_STRATEGY}" = "Mixed" ]; then
MIG_STRATEGY_FLAG="--mig-strategy mixed"
else
# Default to "single" for "Single", "None", empty, or any other value
MIG_STRATEGY_FLAG="--mig-strategy single"
fi
if [ "${ENABLE_MANAGED_GPU_EXPERIENCE}" = "true" ]; then
# Create systemd override directory to configure device plugin
NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR="/etc/systemd/system/nvidia-device-plugin.service.d"
mkdir -p "${NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR}"

if [ "${MIG_NODE}" = "true" ]; then
# Configure with MIG strategy for MIG nodes.
# MIG strategy controls how nvidia-device-plugin exposes MIG instances to Kubernetes:
# - "single": All MIG devices exposed as generic nvidia.com/gpu resources
# - "mixed": MIG devices exposed with specific types like nvidia.com/mig-1g.5gb
#
# We only use "mixed" when explicitly specified via NVIDIA_MIG_STRATEGY.
# Otherwise, we default to "single" which is the safer/simpler option.
# Note: NVIDIA_MIG_STRATEGY values from RP are "None", "Single", "Mixed".
# "None" and "Single" both result in using the "single" strategy.
if [ "${NVIDIA_MIG_STRATEGY}" = "Mixed" ]; then
MIG_STRATEGY_FLAG="--mig-strategy mixed"
else
# Default to "single" for "Single", "None", empty, or any other value
MIG_STRATEGY_FLAG="--mig-strategy single"
fi

tee "${NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR}/10-device-plugin-config.conf" > /dev/null <<EOF
tee "${NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR}/10-device-plugin-config.conf" > /dev/null <<EOF
[Service]
ExecStart=
ExecStart=/usr/bin/nvidia-device-plugin ${MIG_STRATEGY_FLAG} --pass-device-specs
EOF
else
# Configure with pass-device-specs for non-MIG nodes
tee "${NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR}/10-device-plugin-config.conf" > /dev/null <<'EOF'
else
# Configure with pass-device-specs for non-MIG nodes
tee "${NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR}/10-device-plugin-config.conf" > /dev/null <<'EOF'
[Service]
ExecStart=
ExecStart=/usr/bin/nvidia-device-plugin --pass-device-specs
EOF
fi

# Reload systemd to pick up the override
systemctl daemon-reload

logs_to_events "AKS.CSE.start.nvidia-device-plugin" "systemctlEnableAndStart nvidia-device-plugin 30" || exit $ERR_GPU_DEVICE_PLUGIN_START_FAIL
fi

# Reload systemd to pick up the override
systemctl daemon-reload
# 2. Start the dra-driver-nvidia-gpu service.
if [ "${ENABLE_MANAGED_GPU_EXPERIENCE_DRA}" = "true" ]; then

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here: since ENABLE_MANAGED_GPU_EXPERIENCE and ENABLE_MANAGED_GPU_EXPERIENCE_DRA cannot both be true at the same time, I would make that clear in the code. Within this function, right now it looks like both can be enabled independently.

DRA_DRIVER_NVIDIA_GPU_OVERRIDE_DIR="/etc/systemd/system/dra-driver-nvidia-gpu.service.d"
mkdir -p "${DRA_DRIVER_NVIDIA_GPU_OVERRIDE_DIR}"

tee "${DRA_DRIVER_NVIDIA_GPU_OVERRIDE_DIR}/10-dra-driver-nvidia-gpu.conf" > /dev/null <<EOF
[Unit]
Requires=kubelet.service
After=kubelet.service
[Service]
ExecStart=
ExecStart=/usr/bin/gpu-kubelet-plugin --kubeconfig /var/lib/kubelet/kubeconfig --container-driver-root / --image-name "" --node-name=${NODE_NAME}
EOF

logs_to_events "AKS.CSE.start.nvidia-device-plugin" "systemctlEnableAndStart nvidia-device-plugin 30" || exit $ERR_GPU_DEVICE_PLUGIN_START_FAIL
# Reload systemd to pick up the override
systemctl daemon-reload

logs_to_events "AKS.CSE.start.dra-driver-nvidia-gpu" "systemctlEnableAndStart dra-driver-nvidia-gpu 30" || exit $ERR_DRA_DRIVER_START_FAIL
fi

# 2. Start the nvidia-dcgm service.
# 3. Start the nvidia-dcgm service.
# DCGM is monitoring/telemetry and does not gate GPU workload scheduling, so start it without
# blocking node provisioning and treat a slow/failed start as non-fatal.
logs_to_events "AKS.CSE.start.nvidia-dcgm" "systemctlEnableAndStartNoBlock nvidia-dcgm 30" || echo "warning: nvidia-dcgm could not be enqueued; GPU monitoring will start asynchronously"

# 3. Start the nvidia-dcgm-exporter service.
# 4. Start the nvidia-dcgm-exporter service.
# Create systemd drop-in directory for nvidia-dcgm-exporter service
DCGM_EXPORTER_OVERRIDE_DIR="/etc/systemd/system/nvidia-dcgm-exporter.service.d"
mkdir -p "${DCGM_EXPORTER_OVERRIDE_DIR}"
Expand Down
1 change: 1 addition & 0 deletions parts/linux/cloud-init/artifacts/cse_helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ ERR_ENABLE_MANAGED_GPU_EXPERIENCE=123 # Error confguring managed GPU experience
ERR_VHD_BUILD_ERROR=125 # Reserved for VHD CI exit conditions

ERR_NODE_EXPORTER_START_FAIL=128 # Error starting or enabling node-exporter service
ERR_DRA_DRIVER_START_FAIL=129 # dra-driver-nvidia-gpu could not be started by systemctl

ERR_SWAP_CREATE_FAIL=130 # Error allocating swap file
ERR_SWAP_CREATE_INSUFFICIENT_DISK_SPACE=131 # Error insufficient disk space for swap file creation
Expand Down
11 changes: 11 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -491,6 +491,12 @@ function nodePrep {
ENABLE_MANAGED_GPU_EXPERIENCE="true"
fi

if [ "${ENABLE_MANAGED_GPU_DRA,,}" = "true" ]; then
ENABLE_MANAGED_GPU_EXPERIENCE_DRA="true"
fi
Comment thread
runzhen marked this conversation as resolved.

echo "Fully Managed GPU device plugin mode: ${ENABLE_MANAGED_GPU_EXPERIENCE}, DRA mode: ${ENABLE_MANAGED_GPU_EXPERIENCE_DRA}"

logs_to_events "AKS.CSE.configureManagedGPUExperience" configureManagedGPUExperience || exit $ERR_ENABLE_MANAGED_GPU_EXPERIENCE

echo $(date),$(hostname), "End configuring GPU drivers"
Expand Down Expand Up @@ -583,6 +589,11 @@ function nodePrep {

checkServiceHealth kubelet || exit $ERR_KUBELET_FAIL

# defer starting DRA driver services after kubelet.
if [ "${ENABLE_MANAGED_GPU_EXPERIENCE_DRA}" = "true" ]; then
logs_to_events "AKS.CSE.startNvidiaManagedExpServices" "startNvidiaManagedExpServices" || exit $?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is only the _DRA flavor's logic present in this file? I would have expected the normal Managed GPU flow to also call startNvidiaManagedExpServices.

fi

if systemctl cat aks-log-collector.timer &>/dev/null; then
systemctlEnableAndStartNoBlock aks-log-collector.timer 30 || echo "Warning: Could not start aks-log-collector.timer"
else
Expand Down
14 changes: 12 additions & 2 deletions parts/linux/cloud-init/artifacts/mariner/cse_install_mariner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -418,12 +418,20 @@ isPackageInstalled() {
}

managedGPUPackageList() {
packages=(
nvidia-device-plugin
local packages=(
datacenter-gpu-manager-4-core
datacenter-gpu-manager-4-proprietary
dcgm-exporter
)
Comment thread
runzhen marked this conversation as resolved.

if [ "${ENABLE_MANAGED_GPU_EXPERIENCE:-false}" = "true" ]; then

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why this change? My understanding was that datacenter-gpu-manager-4-core, datacenter-gpu-manager-4-proprietary and dcgm-exporter were also part of the Managed Experience. Did that change recently?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok I get it now, only one of the two can be enabled at the same time.

It would be clearer if this were a switch/case or if/else, since that would make it obvious that it's one or the other.

packages+=(nvidia-device-plugin)
fi

if [ "${ENABLE_MANAGED_GPU_EXPERIENCE_DRA:-false}" = "true" ]; then
Comment thread
runzhen marked this conversation as resolved.
packages+=(dra-driver-nvidia-gpu)
fi

echo "${packages[@]}"
Comment thread
runzhen marked this conversation as resolved.
}

Expand All @@ -435,6 +443,8 @@ installNvidiaManagedExpPkgFromCache() {

# Ensure kubelet device-plugins directory exists BEFORE package installation
mkdir -p /var/lib/kubelet/device-plugins
mkdir -p /var/lib/kubelet/plugins_registry
mkdir -p /var/lib/kubelet/plugins

for packageName in $(managedGPUPackageList); do
downloadDir="$(getPackageDownloadDir "${packageName}")"
Expand Down
14 changes: 12 additions & 2 deletions parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -179,18 +179,28 @@ isPackageInstalled() {
}

managedGPUPackageList() {
packages=(
nvidia-device-plugin
local packages=(
datacenter-gpu-manager-4-core
datacenter-gpu-manager-4-proprietary
dcgm-exporter
)

if [ "${ENABLE_MANAGED_GPU_EXPERIENCE:-false}" = "true" ]; then
packages+=(nvidia-device-plugin)
fi

if [ "${ENABLE_MANAGED_GPU_EXPERIENCE_DRA:-false}" = "true" ]; then
Comment thread
runzhen marked this conversation as resolved.
packages+=(dra-driver-nvidia-gpu)
fi

echo "${packages[@]}"
Comment thread
runzhen marked this conversation as resolved.
}

installNvidiaManagedExpPkgFromCache() {
# Ensure kubelet device-plugins directory exists BEFORE package installation
mkdir -p /var/lib/kubelet/device-plugins
mkdir -p /var/lib/kubelet/plugins_registry
mkdir -p /var/lib/kubelet/plugins

for packageName in $(managedGPUPackageList); do
downloadDir="/opt/${packageName}/downloads"
Expand Down
3 changes: 3 additions & 0 deletions pkg/agent/baker.go
Original file line number Diff line number Diff line change
Expand Up @@ -1369,6 +1369,9 @@ func getContainerServiceFuncMap(config *datamodel.NodeBootstrappingConfiguration
"IsEnableManagedGPU": func() bool {
return config.EnableManagedGPU
},
"IsEnableManagedGPUDRA": func() bool {
return config.EnableManagedGPUDRA
},
"EnableIMDSRestriction": func() bool {
return config.EnableIMDSRestriction
},
Expand Down
Loading
Loading