-
Notifications
You must be signed in to change notification settings - Fork 265
feat: install DRA nvidia gpu plugin #8797
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
a55db8e
707adcd
b21d1f4
fe7d159
df41b98
a1dabbe
e5a1fbd
6eeba1e
c99392e
3db16e7
f0aa469
6e7a1fb
1478e0b
ff4ed88
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1826,16 +1826,21 @@ configureManagedGPUExperience() { | |
| return | ||
| fi | ||
| local managed_gpu_marker="/opt/azure/containers/managed-gpu-experience.enabled" | ||
| if [ "${ENABLE_MANAGED_GPU_EXPERIENCE}" = "true" ]; then | ||
| if [ "${ENABLE_MANAGED_GPU_EXPERIENCE}" = "true" ] || [ "${ENABLE_MANAGED_GPU_EXPERIENCE_DRA}" = "true" ]; then | ||
| logs_to_events "AKS.CSE.installNvidiaManagedExpPkgFromCache" "installNvidiaManagedExpPkgFromCache" || exit $ERR_NVIDIA_DCGM_INSTALL | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is all code duplicated from the previous IF section; please clean it up. |
||
| logs_to_events "AKS.CSE.startNvidiaManagedExpServices" "startNvidiaManagedExpServices" || exit $ERR_NVIDIA_DCGM_EXPORTER_FAIL | ||
| # defer startNvidiaManagedExpServices() after kubelet starts | ||
| addKubeletNodeLabel "kubernetes.azure.com/dcgm-exporter=enabled" | ||
| mkdir -p "$(dirname "${managed_gpu_marker}")" | ||
|
runzhen marked this conversation as resolved.
|
||
| touch "${managed_gpu_marker}" | ||
| else | ||
| # EnableManagedGPUExperience is mutable, so services may have been | ||
| # installed on a previous CSE run. Stop them if they exist. | ||
| logs_to_events "AKS.CSE.stop.nvidia-device-plugin" "systemctlDisableAndStop nvidia-device-plugin" | ||
| if [ "${ENABLE_MANAGED_GPU_EXPERIENCE}" = "true" ]; then | ||
| logs_to_events "AKS.CSE.stop.nvidia-device-plugin" "systemctlDisableAndStop nvidia-device-plugin" | ||
| fi | ||
| if [ "${ENABLE_MANAGED_GPU_EXPERIENCE_DRA}" = "true" ]; then | ||
| logs_to_events "AKS.CSE.stop.dra-driver-nvidia-gpu" "systemctlDisableAndStop dra-driver-nvidia-gpu" | ||
| fi | ||
|
runzhen marked this conversation as resolved.
Outdated
|
||
| logs_to_events "AKS.CSE.stop.nvidia-dcgm" "systemctlDisableAndStop nvidia-dcgm" | ||
| logs_to_events "AKS.CSE.stop.nvidia-dcgm-exporter" "systemctlDisableAndStop nvidia-dcgm-exporter" | ||
| rm -f "${managed_gpu_marker}" | ||
|
|
@@ -1844,52 +1849,74 @@ configureManagedGPUExperience() { | |
|
|
||
| startNvidiaManagedExpServices() { | ||
| # 1. Start the nvidia-device-plugin service. | ||
| # Create systemd override directory to configure device plugin | ||
| NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR="/etc/systemd/system/nvidia-device-plugin.service.d" | ||
| mkdir -p "${NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR}" | ||
|
|
||
| if [ "${MIG_NODE}" = "true" ]; then | ||
| # Configure with MIG strategy for MIG nodes. | ||
| # MIG strategy controls how nvidia-device-plugin exposes MIG instances to Kubernetes: | ||
| # - "single": All MIG devices exposed as generic nvidia.com/gpu resources | ||
| # - "mixed": MIG devices exposed with specific types like nvidia.com/mig-1g.5gb | ||
| # | ||
| # We only use "mixed" when explicitly specified via NVIDIA_MIG_STRATEGY. | ||
| # Otherwise, we default to "single" which is the safer/simpler option. | ||
| # Note: NVIDIA_MIG_STRATEGY values from RP are "None", "Single", "Mixed". | ||
| # "None" and "Single" both result in using the "single" strategy. | ||
| if [ "${NVIDIA_MIG_STRATEGY}" = "Mixed" ]; then | ||
| MIG_STRATEGY_FLAG="--mig-strategy mixed" | ||
| else | ||
| # Default to "single" for "Single", "None", empty, or any other value | ||
| MIG_STRATEGY_FLAG="--mig-strategy single" | ||
| fi | ||
| if [ "${ENABLE_MANAGED_GPU_EXPERIENCE}" = "true" ]; then | ||
| # Create systemd override directory to configure device plugin | ||
| NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR="/etc/systemd/system/nvidia-device-plugin.service.d" | ||
| mkdir -p "${NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR}" | ||
|
|
||
| if [ "${MIG_NODE}" = "true" ]; then | ||
| # Configure with MIG strategy for MIG nodes. | ||
| # MIG strategy controls how nvidia-device-plugin exposes MIG instances to Kubernetes: | ||
| # - "single": All MIG devices exposed as generic nvidia.com/gpu resources | ||
| # - "mixed": MIG devices exposed with specific types like nvidia.com/mig-1g.5gb | ||
| # | ||
| # We only use "mixed" when explicitly specified via NVIDIA_MIG_STRATEGY. | ||
| # Otherwise, we default to "single" which is the safer/simpler option. | ||
| # Note: NVIDIA_MIG_STRATEGY values from RP are "None", "Single", "Mixed". | ||
| # "None" and "Single" both result in using the "single" strategy. | ||
| if [ "${NVIDIA_MIG_STRATEGY}" = "Mixed" ]; then | ||
| MIG_STRATEGY_FLAG="--mig-strategy mixed" | ||
| else | ||
| # Default to "single" for "Single", "None", empty, or any other value | ||
| MIG_STRATEGY_FLAG="--mig-strategy single" | ||
| fi | ||
|
|
||
| tee "${NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR}/10-device-plugin-config.conf" > /dev/null <<EOF | ||
| tee "${NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR}/10-device-plugin-config.conf" > /dev/null <<EOF | ||
| [Service] | ||
| ExecStart= | ||
| ExecStart=/usr/bin/nvidia-device-plugin ${MIG_STRATEGY_FLAG} --pass-device-specs | ||
| EOF | ||
| else | ||
| # Configure with pass-device-specs for non-MIG nodes | ||
| tee "${NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR}/10-device-plugin-config.conf" > /dev/null <<'EOF' | ||
| else | ||
| # Configure with pass-device-specs for non-MIG nodes | ||
| tee "${NVIDIA_DEVICE_PLUGIN_OVERRIDE_DIR}/10-device-plugin-config.conf" > /dev/null <<'EOF' | ||
| [Service] | ||
| ExecStart= | ||
| ExecStart=/usr/bin/nvidia-device-plugin --pass-device-specs | ||
| EOF | ||
| fi | ||
|
|
||
| # Reload systemd to pick up the override | ||
| systemctl daemon-reload | ||
|
|
||
| logs_to_events "AKS.CSE.start.nvidia-device-plugin" "systemctlEnableAndStart nvidia-device-plugin 30" || exit $ERR_GPU_DEVICE_PLUGIN_START_FAIL | ||
| fi | ||
|
|
||
| # Reload systemd to pick up the override | ||
| systemctl daemon-reload | ||
| # 2. Start the dra-driver-nvidia-gpu service. | ||
| if [ "${ENABLE_MANAGED_GPU_EXPERIENCE_DRA}" = "true" ]; then | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same here: since ENABLE_MANAGED_GPU_EXPERIENCE and ENABLE_MANAGED_GPU_EXPERIENCE_DRA cannot both be true at the same time, I would make that clear in the code. Within this function, it currently looks like both can be enabled independently. |
||
| DRA_DRIVER_NVIDIA_GPU_OVERRIDE_DIR="/etc/systemd/system/dra-driver-nvidia-gpu.service.d" | ||
| mkdir -p "${DRA_DRIVER_NVIDIA_GPU_OVERRIDE_DIR}" | ||
|
|
||
| logs_to_events "AKS.CSE.start.nvidia-device-plugin" "systemctlEnableAndStart nvidia-device-plugin 30" || exit $ERR_GPU_DEVICE_PLUGIN_START_FAIL | ||
| tee "${DRA_DRIVER_NVIDIA_GPU_OVERRIDE_DIR}/10-dra-driver-nvidia-gpu.conf" > /dev/null <<EOF | ||
| [Unit] | ||
| Requires=kubelet.service | ||
| After=kubelet.service | ||
| [Service] | ||
| ExecStart= | ||
| ExecStart=/usr/bin/gpu-kubelet-plugin --kubeconfig /var/lib/kubelet/kubeconfig --container-driver-root / --image-name "" --node-name=${NODE_NAME} | ||
| EOF | ||
|
|
||
| # Reload systemd to pick up the override | ||
| systemctl daemon-reload | ||
|
|
||
| logs_to_events "AKS.CSE.start.dra-driver-nvidia-gpu" "systemctlEnableAndStart dra-driver-nvidia-gpu 30" || exit $ERR_DRA_DRIVER_START_FAIL | ||
| fi | ||
|
|
||
| # 2. Start the nvidia-dcgm service. | ||
| # 3. Start the nvidia-dcgm service. | ||
| # DCGM is monitoring/telemetry and does not gate GPU workload scheduling, so start it without | ||
| # blocking node provisioning and treat a slow/failed start as non-fatal. | ||
| logs_to_events "AKS.CSE.start.nvidia-dcgm" "systemctlEnableAndStartNoBlock nvidia-dcgm 30" || echo "warning: nvidia-dcgm could not be enqueued; GPU monitoring will start asynchronously" | ||
|
|
||
| # 3. Start the nvidia-dcgm-exporter service. | ||
| # 4. Start the nvidia-dcgm-exporter service. | ||
| # Create systemd drop-in directory for nvidia-dcgm-exporter service | ||
| DCGM_EXPORTER_OVERRIDE_DIR="/etc/systemd/system/nvidia-dcgm-exporter.service.d" | ||
| mkdir -p "${DCGM_EXPORTER_OVERRIDE_DIR}" | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -418,12 +418,20 @@ isPackageInstalled() { | |
| } | ||
|
|
||
| managedGPUPackageList() { | ||
| packages=( | ||
| nvidia-device-plugin | ||
| local packages=( | ||
| datacenter-gpu-manager-4-core | ||
| datacenter-gpu-manager-4-proprietary | ||
| dcgm-exporter | ||
| ) | ||
|
runzhen marked this conversation as resolved.
|
||
|
|
||
| if [ "${ENABLE_MANAGED_GPU_EXPERIENCE:-false}" = "true" ]; then | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why this change? My understanding was that
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK, I get it now: only one of the two can be enabled at a time. It would be clearer if this were a switch/case or an if/else, since that would make it obvious that it is one or the other. |
||
| packages+=(nvidia-device-plugin) | ||
| fi | ||
|
|
||
| if [ "${ENABLE_MANAGED_GPU_EXPERIENCE_DRA:-false}" = "true" ]; then | ||
|
runzhen marked this conversation as resolved.
|
||
| packages+=(dra-driver-nvidia-gpu) | ||
| fi | ||
|
|
||
| echo "${packages[@]}" | ||
|
runzhen marked this conversation as resolved.
|
||
| } | ||
|
|
||
|
|
@@ -435,6 +443,8 @@ installNvidiaManagedExpPkgFromCache() { | |
|
|
||
| # Ensure kubelet device-plugins directory exists BEFORE package installation | ||
| mkdir -p /var/lib/kubelet/device-plugins | ||
| mkdir -p /var/lib/kubelet/plugins_registry | ||
| mkdir -p /var/lib/kubelet/plugins | ||
|
|
||
| for packageName in $(managedGPUPackageList); do | ||
| downloadDir="$(getPackageDownloadDir "${packageName}")" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not supported on AZL?