From b2b087bf1d5a4185c8fe27e654e5253392f442e9 Mon Sep 17 00:00:00 2001
From: Ganeshkumar Ashokavardhanan
Date: Tue, 30 Jun 2026 16:11:22 -0700
Subject: [PATCH 1/2] feat(gpu): enable CUDA driver prebake on shared Ubuntu gen2 VHD builds

Turn on the NVIDIA_CUDA_PREBAKE feature flag (added dark in #8786) for the two shared x86 gen2 Ubuntu images that GPU CUDA nodes boot -- 2204gen2containerd and 2404gen2containerd -- in both the release (.vsts-vhd-builder-release.yaml) and PR/test (.vsts-vhd-builder.yaml) VHD pipelines.

With the flag set, install-dependencies.sh pre-builds the NVIDIA CUDA kernel module into the VHD at build time, so GPU nodes can later skip the ~80-150s in-CSE DKMS compile once the configGPUDrivers skip-build path (PR #8787) lands; this change only enables the bake. Non-GPU and --gpu-driver None nodes tear the module down during provisioning (cleanUpPrebakedGPUDriver, also from #8786), so the shared image carries no extra attack surface on those nodes.

Scope rationale:
- Only 22.04/24.04 gen2 x86: these are the images GPU CUDA SKUs (A10/A100/H100) boot, confirmed via e2e GPU scenarios that pin VHDUbuntu2204Gen2Containerd / VHDUbuntu2404Gen2Containerd. AzureLinux GPU is out of scope (the bake is Ubuntu-only); gen1/FIPS/TL/arm64 are not used by supported CUDA GPU SKUs.
- The Copy CIS Reports step keys off an exact-match FEATURE_FLAGS allowlist, so NVIDIA_CUDA_PREBAKE is added to that list to preserve report publishing.
Signed-off-by: Ganeshkumar Ashokavardhanan --- .pipelines/.vsts-vhd-builder-release.yaml | 8 ++++++-- .pipelines/.vsts-vhd-builder.yaml | 8 ++++++-- .pipelines/templates/.builder-release-template.yaml | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.pipelines/.vsts-vhd-builder-release.yaml b/.pipelines/.vsts-vhd-builder-release.yaml index 3ff9671a1fd..dd6a6ca201a 100644 --- a/.pipelines/.vsts-vhd-builder-release.yaml +++ b/.pipelines/.vsts-vhd-builder-release.yaml @@ -611,7 +611,9 @@ stages: echo '##vso[task.setvariable variable=IMG_VERSION]latest' echo '##vso[task.setvariable variable=HYPERV_GENERATION]V2' echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_D16ds_v5' - echo '##vso[task.setvariable variable=FEATURE_FLAGS]None' + # NVIDIA_CUDA_PREBAKE bakes the CUDA driver kernel module into this shared x86 gen2 image so GPU + # nodes skip the ~80-150s DKMS compile at boot; non-GPU nodes tear it down during provisioning. + echo '##vso[task.setvariable variable=FEATURE_FLAGS]NVIDIA_CUDA_PREBAKE' echo '##vso[task.setvariable variable=ARCHITECTURE]X86_64' echo '##vso[task.setvariable variable=ENABLE_FIPS]False' echo '##vso[task.setvariable variable=ENABLE_TRUSTED_LAUNCH]False' @@ -683,7 +685,9 @@ stages: echo '##vso[task.setvariable variable=IMG_VERSION]latest' echo '##vso[task.setvariable variable=HYPERV_GENERATION]V2' echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_D16ds_v5' - echo '##vso[task.setvariable variable=FEATURE_FLAGS]None' + # NVIDIA_CUDA_PREBAKE bakes the CUDA driver kernel module into this shared x86 gen2 image so GPU + # nodes skip the ~80-150s DKMS compile at boot; non-GPU nodes tear it down during provisioning. 
+ echo '##vso[task.setvariable variable=FEATURE_FLAGS]NVIDIA_CUDA_PREBAKE' echo '##vso[task.setvariable variable=ARCHITECTURE]X86_64' echo '##vso[task.setvariable variable=ENABLE_FIPS]False' echo '##vso[task.setvariable variable=ENABLE_TRUSTED_LAUNCH]False' diff --git a/.pipelines/.vsts-vhd-builder.yaml b/.pipelines/.vsts-vhd-builder.yaml index e556a651b49..0383d9adadd 100644 --- a/.pipelines/.vsts-vhd-builder.yaml +++ b/.pipelines/.vsts-vhd-builder.yaml @@ -83,7 +83,9 @@ stages: echo '##vso[task.setvariable variable=IMG_VERSION]latest' echo '##vso[task.setvariable variable=HYPERV_GENERATION]V2' echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_D16ds_v5' - echo '##vso[task.setvariable variable=FEATURE_FLAGS]None' + # NVIDIA_CUDA_PREBAKE bakes the CUDA driver kernel module into this shared x86 gen2 image so GPU + # nodes skip the ~80-150s DKMS compile at boot; non-GPU nodes tear it down during provisioning. + echo '##vso[task.setvariable variable=FEATURE_FLAGS]NVIDIA_CUDA_PREBAKE' echo '##vso[task.setvariable variable=ARCHITECTURE]X86_64' echo '##vso[task.setvariable variable=ENABLE_FIPS]False' echo '##vso[task.setvariable variable=ENABLE_TRUSTED_LAUNCH]False' @@ -104,7 +106,9 @@ stages: echo '##vso[task.setvariable variable=IMG_VERSION]latest' echo '##vso[task.setvariable variable=HYPERV_GENERATION]V2' echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_D16ds_v5' - echo '##vso[task.setvariable variable=FEATURE_FLAGS]None' + # NVIDIA_CUDA_PREBAKE bakes the CUDA driver kernel module into this shared x86 gen2 image so GPU + # nodes skip the ~80-150s DKMS compile at boot; non-GPU nodes tear it down during provisioning. 
+ echo '##vso[task.setvariable variable=FEATURE_FLAGS]NVIDIA_CUDA_PREBAKE' echo '##vso[task.setvariable variable=ARCHITECTURE]X86_64' echo '##vso[task.setvariable variable=ENABLE_FIPS]false' echo '##vso[task.setvariable variable=ENABLE_TRUSTED_LAUNCH]False' diff --git a/.pipelines/templates/.builder-release-template.yaml b/.pipelines/templates/.builder-release-template.yaml index 3f8e59bb583..6c5195359dc 100644 --- a/.pipelines/templates/.builder-release-template.yaml +++ b/.pipelines/templates/.builder-release-template.yaml @@ -353,7 +353,7 @@ steps: TargetFolder: '$(Build.ArtifactStagingDirectory)' - task: CopyFiles@2 - condition: and(eq(variables.OS_SKU, 'Ubuntu'), in(variables.OS_VERSION, '22.04', '24.04'), in(variables.FEATURE_FLAGS, 'None', 'cvm', 'NVIDIA_GB')) + condition: and(eq(variables.OS_SKU, 'Ubuntu'), in(variables.OS_VERSION, '22.04', '24.04'), in(variables.FEATURE_FLAGS, 'None', 'cvm', 'NVIDIA_GB', 'NVIDIA_CUDA_PREBAKE')) displayName: Copy CIS Reports inputs: SourceFolder: '$(System.DefaultWorkingDirectory)' From 9a42032f9e8ddd17abc64d94d50ede5527527161 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Tue, 30 Jun 2026 16:19:39 -0700 Subject: [PATCH 2/2] docs(gpu): clarify prebake comment is a capability gated on the consume path Address review feedback: the inline rationale said GPU nodes "skip the DKMS compile at boot", but this PR only enables the bake -- the boot-time skip requires the configGPUDrivers skip-build path (PR #8787), which is not yet in main. Reword to "can later skip ... via the configGPUDrivers skip-build path" so pipeline maintainers aren't misled, and note the teardown covers non-GPU and --gpu-driver None nodes. 
Signed-off-by: Ganeshkumar Ashokavardhanan --- .pipelines/.vsts-vhd-builder-release.yaml | 6 ++++-- .pipelines/.vsts-vhd-builder.yaml | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.pipelines/.vsts-vhd-builder-release.yaml b/.pipelines/.vsts-vhd-builder-release.yaml index dd6a6ca201a..ae7819d6f19 100644 --- a/.pipelines/.vsts-vhd-builder-release.yaml +++ b/.pipelines/.vsts-vhd-builder-release.yaml @@ -612,7 +612,8 @@ stages: echo '##vso[task.setvariable variable=HYPERV_GENERATION]V2' echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_D16ds_v5' # NVIDIA_CUDA_PREBAKE bakes the CUDA driver kernel module into this shared x86 gen2 image so GPU - # nodes skip the ~80-150s DKMS compile at boot; non-GPU nodes tear it down during provisioning. + # nodes can later skip the ~80-150s in-CSE DKMS compile via the configGPUDrivers skip-build path + # (PR #8787); non-GPU and --gpu-driver None nodes tear the module down during provisioning. echo '##vso[task.setvariable variable=FEATURE_FLAGS]NVIDIA_CUDA_PREBAKE' echo '##vso[task.setvariable variable=ARCHITECTURE]X86_64' echo '##vso[task.setvariable variable=ENABLE_FIPS]False' @@ -686,7 +687,8 @@ stages: echo '##vso[task.setvariable variable=HYPERV_GENERATION]V2' echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_D16ds_v5' # NVIDIA_CUDA_PREBAKE bakes the CUDA driver kernel module into this shared x86 gen2 image so GPU - # nodes skip the ~80-150s DKMS compile at boot; non-GPU nodes tear it down during provisioning. + # nodes can later skip the ~80-150s in-CSE DKMS compile via the configGPUDrivers skip-build path + # (PR #8787); non-GPU and --gpu-driver None nodes tear the module down during provisioning. 
echo '##vso[task.setvariable variable=FEATURE_FLAGS]NVIDIA_CUDA_PREBAKE' echo '##vso[task.setvariable variable=ARCHITECTURE]X86_64' echo '##vso[task.setvariable variable=ENABLE_FIPS]False' diff --git a/.pipelines/.vsts-vhd-builder.yaml b/.pipelines/.vsts-vhd-builder.yaml index 0383d9adadd..e4ecb102a1f 100644 --- a/.pipelines/.vsts-vhd-builder.yaml +++ b/.pipelines/.vsts-vhd-builder.yaml @@ -84,7 +84,8 @@ stages: echo '##vso[task.setvariable variable=HYPERV_GENERATION]V2' echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_D16ds_v5' # NVIDIA_CUDA_PREBAKE bakes the CUDA driver kernel module into this shared x86 gen2 image so GPU - # nodes skip the ~80-150s DKMS compile at boot; non-GPU nodes tear it down during provisioning. + # nodes can later skip the ~80-150s in-CSE DKMS compile via the configGPUDrivers skip-build path + # (PR #8787); non-GPU and --gpu-driver None nodes tear the module down during provisioning. echo '##vso[task.setvariable variable=FEATURE_FLAGS]NVIDIA_CUDA_PREBAKE' echo '##vso[task.setvariable variable=ARCHITECTURE]X86_64' echo '##vso[task.setvariable variable=ENABLE_FIPS]False' @@ -107,7 +108,8 @@ stages: echo '##vso[task.setvariable variable=HYPERV_GENERATION]V2' echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_D16ds_v5' # NVIDIA_CUDA_PREBAKE bakes the CUDA driver kernel module into this shared x86 gen2 image so GPU - # nodes skip the ~80-150s DKMS compile at boot; non-GPU nodes tear it down during provisioning. + # nodes can later skip the ~80-150s in-CSE DKMS compile via the configGPUDrivers skip-build path + # (PR #8787); non-GPU and --gpu-driver None nodes tear the module down during provisioning. echo '##vso[task.setvariable variable=FEATURE_FLAGS]NVIDIA_CUDA_PREBAKE' echo '##vso[task.setvariable variable=ARCHITECTURE]X86_64' echo '##vso[task.setvariable variable=ENABLE_FIPS]false'