From 129990cf4194200d067c23242afd524b95f52278 Mon Sep 17 00:00:00 2001 From: Wen Huang Date: Thu, 2 Jul 2026 22:53:18 +0000 Subject: [PATCH] feat(linux): add AMD MI300X ROCm bootstrap --- aks-node-controller/parser/helper.go | 7 + aks-node-controller/parser/parser.go | 1 + aks-node-controller/parser/parser_test.go | 17 ++ parts/linux/cloud-init/artifacts/cse_cmd.sh | 1 + .../linux/cloud-init/artifacts/cse_helpers.sh | 4 + parts/linux/cloud-init/artifacts/cse_main.sh | 10 +- .../artifacts/ubuntu/cse_install_ubuntu.sh | 180 ++++++++++++++++++ pkg/agent/baker_test.go | 14 ++ pkg/agent/variables.go | 1 + pkg/agent/variables_test.go | 12 ++ vhdbuilder/packer/install-dependencies.sh | 9 + .../packer/test/linux-vhd-content-test.sh | 83 ++++++++ .../linux/ubuntu/tool_installs_ubuntu.sh | 116 +++++++++++ 13 files changed, 453 insertions(+), 2 deletions(-) diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index 168cfc48a80..4a1e102e149 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -728,6 +728,13 @@ func getEnableNvidia(config *aksnodeconfigv1.Configuration) bool { return false } +func getEnableAmdGpu(config *aksnodeconfigv1.Configuration) bool { + if config.GpuConfig != nil && config.GpuConfig.EnableAmdGpu != nil { + return *config.GpuConfig.EnableAmdGpu + } + return false +} + func removeNewlines(str string) string { sanitizedStr := strings.ReplaceAll(str, "\n", "") sanitizedStr = strings.ReplaceAll(sanitizedStr, "\r", "") diff --git a/aks-node-controller/parser/parser.go b/aks-node-controller/parser/parser.go index 0eebd3e542b..c925ea05ae9 100644 --- a/aks-node-controller/parser/parser.go +++ b/aks-node-controller/parser/parser.go @@ -81,6 +81,7 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string { "API_SERVER_NAME": config.GetApiServerConfig().GetApiServerName(), "IS_VHD": fmt.Sprintf("%v", getIsVHD(config.IsVhd)), "GPU_NODE": fmt.Sprintf("%v", 
getEnableNvidia(config)), + "AMD_GPU_NODE": fmt.Sprintf("%v", getEnableAmdGpu(config)), "SGX_NODE": fmt.Sprintf("%v", getIsSgxEnabledSKU(config.GetVmSize())), "MIG_NODE": fmt.Sprintf("%v", getIsMIGNode(config.GetGpuConfig().GetGpuInstanceProfile())), "CONFIG_GPU_DRIVER_IF_NEEDED": fmt.Sprintf("%v", config.GetGpuConfig().GetConfigGpuDriver()), diff --git a/aks-node-controller/parser/parser_test.go b/aks-node-controller/parser/parser_test.go index d43ece17486..4ec49abc693 100644 --- a/aks-node-controller/parser/parser_test.go +++ b/aks-node-controller/parser/parser_test.go @@ -40,6 +40,7 @@ func TestBuildCSECmd(t *testing.T) { validator: func(cmd *exec.Cmd) { vars := environToMap(cmd.Env) assert.Equal(t, "false", vars["GPU_NODE"]) + assert.Equal(t, "false", vars["AMD_GPU_NODE"]) assert.NotEmpty(t, vars["CONTAINERD_CONFIG_NO_GPU_CONTENT"]) // Ensure the containerd config does not use the // nvidia container runtime when skipping the @@ -69,6 +70,21 @@ oom_score = -999 require.Equal(t, expectedShimConfig, containerdConfigFileContent) }, }, + { + name: "AKSUbuntu2204 containerd with AMD GPU", + folder: "AKSUbuntu2204+Containerd+MIG", + k8sVersion: "1.19.13", + aksNodeConfigUpdator: func(aksNodeConfig *aksnodeconfigv1.Configuration) { + aksNodeConfig.GpuConfig.EnableNvidia = to.Ptr(false) + aksNodeConfig.GpuConfig.EnableAmdGpu = to.Ptr(true) + aksNodeConfig.VmSize = "Standard_ND96isr_MI300X_v5" + }, + validator: func(cmd *exec.Cmd) { + vars := environToMap(cmd.Env) + assert.Equal(t, "false", vars["GPU_NODE"]) + assert.Equal(t, "true", vars["AMD_GPU_NODE"]) + }, + }, { name: "AKSUbuntu2204 DisableSSH with enabled ssh", folder: "AKSUbuntu2204+SSHStatusOn", @@ -452,6 +468,7 @@ func TestAKSNodeConfigCompatibilityFromJsonToCSECommand(t *testing.T) { assertHasKeyWithValue(t, vars, "NETWORK_PLUGIN", "") assertHasKeyWithValue(t, vars, "VNET_CNI_PLUGINS_URL", "") assertHasKeyWithValue(t, vars, "GPU_NODE", "false") + assertHasKeyWithValue(t, vars, "AMD_GPU_NODE", "false") 
assertHasKeyWithValue(t, vars, "GPU_INSTANCE_PROFILE", "") assertHasKeyWithValue(t, vars, "CUSTOM_CA_TRUST_COUNT", "0") assertHasKeyWithValue(t, vars, "SHOULD_CONFIGURE_CUSTOM_CA_TRUST", "false") diff --git a/parts/linux/cloud-init/artifacts/cse_cmd.sh b/parts/linux/cloud-init/artifacts/cse_cmd.sh index 52ffb72de76..b56cc286d4e 100644 --- a/parts/linux/cloud-init/artifacts/cse_cmd.sh +++ b/parts/linux/cloud-init/artifacts/cse_cmd.sh @@ -74,6 +74,7 @@ IDENTITY_BINDINGS_LOCAL_AUTHORITY_SNI={{GetVariable "identityBindingsLocalAuthor API_SERVER_NAME={{GetKubernetesEndpoint}} IS_VHD={{GetVariable "isVHD"}} GPU_NODE={{GetVariable "gpuNode"}} +AMD_GPU_NODE={{GetVariable "amdGpuNode"}} SGX_NODE={{GetVariable "sgxNode"}} MIG_NODE={{GetVariable "migNode"}} CONFIG_GPU_DRIVER_IF_NEEDED={{GetVariable "configGPUDriverIfNeeded"}} diff --git a/parts/linux/cloud-init/artifacts/cse_helpers.sh b/parts/linux/cloud-init/artifacts/cse_helpers.sh index c438e0610d5..ba8a1175505 100755 --- a/parts/linux/cloud-init/artifacts/cse_helpers.sh +++ b/parts/linux/cloud-init/artifacts/cse_helpers.sh @@ -161,6 +161,10 @@ ERR_AKS_NODE_CONTROLLER_ERROR=240 # Generic error in AKS Node Controller ERR_AZNFS_RPM_DOWNLOAD_TIMEOUT=241 # Timeout downloading aznfs RPM from PMC ERR_AZNFS_INSTALL_FAIL=242 # Failed to install aznfs RPM package ERR_SECONDARY_NIC_CONFIG_FAIL=243 # Error configuring secondary NIC network interface +ERR_AMD_ROCM_UNSUPPORTED_OS=244 # AMD ROCm CSE install is only supported on Ubuntu 24.04 amd64 with a supported MI300X VM SKU +ERR_AMD_ROCM_GPG_KEY_DOWNLOAD_TIMEOUT=245 # Failed to download or dearmor the AMD ROCm GPG key +ERR_AMD_ROCM_INSTALL_TIMEOUT=246 # Timeout or failure installing AMD ROCm packages (also used when ldconfig fails) +ERR_AMD_ROCM_VALIDATE_FAIL=247 # AMD ROCm CSE validation failed # ----------------------------------------------------------------------------- # This probably wasn't launched via a login shell, so ensure the PATH is correct. 
diff --git a/parts/linux/cloud-init/artifacts/cse_main.sh b/parts/linux/cloud-init/artifacts/cse_main.sh index 59082fd5287..7daa92ebbd2 100755 --- a/parts/linux/cloud-init/artifacts/cse_main.sh +++ b/parts/linux/cloud-init/artifacts/cse_main.sh @@ -423,8 +423,14 @@ function nodePrep { # By default, never reboot new nodes. REBOOTREQUIRED=false - # Install and configure GPU drivers if this is a GPU node - if [ "${GPU_NODE}" = "true" ] && [ "${skip_nvidia_driver_install}" != "true" ]; then + # Install and configure AMD GPU drivers if this is an AMD GPU node. + if [ "${AMD_GPU_NODE}" = "true" ]; then + echo $(date),$(hostname), "Start configuring AMD GPU drivers" + logs_to_events "AKS.CSE.ensureAmdGpuDrivers" ensureAmdGpuDrivers + echo $(date),$(hostname), "End configuring AMD GPU drivers" + + # Install and configure NVIDIA GPU drivers if this is an NVIDIA GPU node. + elif [ "${GPU_NODE}" = "true" ] && [ "${skip_nvidia_driver_install}" != "true" ]; then echo $(date),$(hostname), "Start configuring GPU drivers" # Install GPU drivers diff --git a/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh b/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh index 602ea0f89e2..6f62282c2b9 100755 --- a/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh +++ b/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh @@ -223,6 +223,186 @@ removeNvidiaRepos() { fi } +amdRocmUbuntuRelease() { + if [ -n "${UBUNTU_RELEASE:-}" ]; then + echo "${UBUNTU_RELEASE}" + return 0 + fi + . /etc/os-release + echo "${VERSION_ID}" +} + +amdRocmUbuntuCodename() { + if [ -n "${UBUNTU_CODENAME:-}" ]; then + echo "${UBUNTU_CODENAME}" + return 0 + fi + . 
/etc/os-release + echo "${VERSION_CODENAME}" +} + +isAmdRocmSupportedSku() { + local vm_sku + vm_sku="$(get_compute_sku 2>/dev/null || true)" + case "${vm_sku,,}" in + standard_nd96isr_mi300x_v5|standard_nd96is_mi300x_v5) + return 0 + ;; + esac + echo "AMD ROCm CSE install is not supported for VM SKU '${vm_sku}'" + return 1 +} + +setupAmdRocmAptRepos() { + local rocm_version="${1}" + local amdgpu_repo_version="${2}" + local rocm_gpg_keyring_path="/etc/apt/keyrings/rocm.gpg" + local rocm_gpg_key_download_path="/tmp/rocm.gpg.key" + local ubuntu_codename + ubuntu_codename="$(amdRocmUbuntuCodename)" + + if [ -n "${PROXY_VARS}" ]; then + eval "${PROXY_VARS}" + fi + + if ! command -v gpg >/dev/null 2>&1; then + apt_get_install 30 1 300 gnupg || exit $ERR_AMD_ROCM_INSTALL_TIMEOUT + fi + + mkdir -p "$(dirname "${rocm_gpg_keyring_path}")" + retrycmd_curl_file 120 5 25 "${rocm_gpg_key_download_path}" "https://repo.radeon.com/rocm/rocm.gpg.key" 300 || exit $ERR_AMD_ROCM_GPG_KEY_DOWNLOAD_TIMEOUT + gpg --dearmor --yes -o "${rocm_gpg_keyring_path}" "${rocm_gpg_key_download_path}" || exit $ERR_AMD_ROCM_GPG_KEY_DOWNLOAD_TIMEOUT + rm -f "${rocm_gpg_key_download_path}" + + cat > /etc/apt/sources.list.d/rocm.list < /etc/apt/sources.list.d/amdgpu.list < /etc/apt/preferences.d/repo-radeon-pin-600 </dev/null 2>&1; then + command -v "${binary_name}" + return 0 + fi + if [ -x "/opt/rocm/bin/${binary_name}" ]; then + echo "/opt/rocm/bin/${binary_name}" + return 0 + fi + return 1 +} + +ensureAmdRocmModuleAutoload() { + mkdir -p /etc/modules-load.d + for modprobe_conf in /etc/modprobe.d/*.conf; do + [ -f "${modprobe_conf}" ] || continue + sed -i '/^[[:space:]]*blacklist[[:space:]]\+amdgpu\([[:space:]]\|$\)/d' "${modprobe_conf}" + sed -i '/^[[:space:]]*install[[:space:]]\+amdgpu[[:space:]]\+\/bin\/false\([[:space:]]\|$\)/d' "${modprobe_conf}" + done + printf '%s\n' amdgpu > /etc/modules-load.d/amdgpu.conf +} + +validateAmdRocmDriver() { + local kernel_version + local rocminfo_bin + local 
rocm_smi_bin + kernel_version="$(uname -r)" + + for package_name in amdgpu-dkms libdrm-amdgpu-dev rocm-core rocminfo rocm-smi-lib; do + dpkg-query -W "${package_name}" >/dev/null 2>&1 || return 1 + done + + dkms status amdgpu | grep -q "${kernel_version}.*installed" || return 1 + modinfo amdgpu >/dev/null 2>&1 || return 1 + ! grep -qsE '^[[:space:]]*(blacklist[[:space:]]+amdgpu|install[[:space:]]+amdgpu[[:space:]]+/bin/false)([[:space:]]|$)' /etc/modprobe.d/*.conf 2>/dev/null || return 1 + grep -qx amdgpu /etc/modules-load.d/amdgpu.conf || return 1 + retrycmd_if_failure 12 5 30 modprobe amdgpu || return 1 + retrycmd_if_failure 12 5 5 test -e /dev/kfd || return 1 + retrycmd_if_failure 12 5 5 bash -c "find /dev/dri -maxdepth 1 -name 'renderD*' -print -quit | grep -q ." || return 1 + + rocminfo_bin="$(amdRocmBinaryPath rocminfo)" || return 1 + rocm_smi_bin="$(amdRocmBinaryPath rocm-smi)" || return 1 + timeout 60 "${rocminfo_bin}" >/tmp/amd-rocminfo.out 2>&1 || return 1 + grep -q "gfx942" /tmp/amd-rocminfo.out || return 1 + timeout 60 "${rocm_smi_bin}" --showproductname >/tmp/amd-rocm-smi.out 2>&1 || return 1 + grep -q "AMD Instinct MI300X VF" /tmp/amd-rocm-smi.out || return 1 +} + +ensureAmdGpuDrivers() { + local rocm_version="${AMD_ROCM_VERSION:-7.2.4}" + local amdgpu_repo_version="${AMD_ROCM_AMDGPU_REPO_VERSION:-30.30.4}" + local amdgpu_dkms_version="${AMD_ROCM_AMDGPU_DKMS_VERSION:-1:6.16.13.30300400-2341068.24.04}" + local libdrm_amdgpu_dev_version="${AMD_ROCM_LIBDRM_AMDGPU_DEV_VERSION:-1:2.4.125.07020400-2341098.24.04}" + local rocm_package_version="${AMD_ROCM_PACKAGE_VERSION:-7.2.4.70204-93~24.04}" + local rocminfo_package_version="${AMD_ROCM_ROCMINFO_VERSION:-1.0.0.70204-93~24.04}" + local rocm_smi_lib_package_version="${AMD_ROCM_SMI_LIB_VERSION:-7.8.0.70204-93~24.04}" + local kernel_version + local ubuntu_release + kernel_version="$(uname -r)" + ubuntu_release="$(amdRocmUbuntuRelease)" + + if [ "${OS}" != "${UBUNTU_OS_NAME}" ] || [ "${ubuntu_release}" != 
"24.04" ] || [ "$(isARM64)" -eq 1 ]; then + echo "AMD ROCm CSE install is only supported on Ubuntu 24.04 amd64. Found OS=${OS}, Ubuntu=${ubuntu_release}, CPU_ARCH=$(getCPUArch)." + exit $ERR_AMD_ROCM_UNSUPPORTED_OS + fi + isAmdRocmSupportedSku || exit $ERR_AMD_ROCM_UNSUPPORTED_OS + ensureAmdRocmModuleAutoload + + if [ -f /opt/azure/amd-rocm/version ] && validateAmdRocmDriver; then + echo "AMD ROCm driver is already installed and validated" + return 0 + fi + + setupAmdRocmAptRepos "${rocm_version}" "${amdgpu_repo_version}" + + apt_get_install 30 1 600 "linux-headers-${kernel_version}" "linux-modules-extra-${kernel_version}" || exit $ERR_AMD_ROCM_INSTALL_TIMEOUT + apt_get_install 30 1 2400 \ + "amdgpu-dkms=${amdgpu_dkms_version}" \ + "libdrm-amdgpu-dev=${libdrm_amdgpu_dev_version}" \ + "rocm-core=${rocm_package_version}" \ + "rocminfo=${rocminfo_package_version}" \ + "rocm-smi-lib=${rocm_smi_lib_package_version}" || exit $ERR_AMD_ROCM_INSTALL_TIMEOUT + ldconfig || exit $ERR_AMD_ROCM_INSTALL_TIMEOUT + + mkdir -p /opt/azure/amd-rocm + cat > /opt/azure/amd-rocm/version <> ${VHD_LOGS_FILEPATH} + - amd-rocm-prebake=$(cat /opt/azure/amd-rocm/version | tr '\n' ' ') +EOF +fi +capture_benchmark "${SCRIPT_NAME}_install_amd_rocm_prebake" + if [ -d "/opt/gpu" ] && [ "$(ls -A /opt/gpu)" ]; then ls -ltr /opt/gpu/* >> ${VHD_LOGS_FILEPATH} fi diff --git a/vhdbuilder/packer/test/linux-vhd-content-test.sh b/vhdbuilder/packer/test/linux-vhd-content-test.sh index da466bdda25..8550a73ab89 100644 --- a/vhdbuilder/packer/test/linux-vhd-content-test.sh +++ b/vhdbuilder/packer/test/linux-vhd-content-test.sh @@ -2428,6 +2428,88 @@ testDiskQueueServiceIsActive() { echo "$test:Finish" } +testAmdRocmPrebake() { + local test="testAmdRocmPrebake" + echo "$test: Start" + + if ! 
echo "$FEATURE_FLAGS" | grep -q "AMD_ROCM"; then + echo "$test: Skipping - AMD_ROCM feature flag not set" + echo "$test: Finish" + return 0 + fi + + if [ "$OS_SKU" != "Ubuntu" ] || [ "$OS_VERSION" != "24.04" ]; then + err "$test" "AMD_ROCM should only be used with Ubuntu 24.04, got OS_SKU=$OS_SKU OS_VERSION=$OS_VERSION" + return 1 + fi + + if [ ! -f /opt/azure/amd-rocm/version ]; then + err "$test" "/opt/azure/amd-rocm/version marker is missing" + return 1 + fi + + for package_name in amdgpu-dkms rocm rocm-core rocminfo rocm-smi-lib; do + if ! dpkg-query -W "${package_name}" >/dev/null 2>&1; then + err "$test" "${package_name} package is not installed" + return 1 + fi + done + + if ! dkms status amdgpu | grep -q "$(uname -r).*installed"; then + err "$test" "amdgpu DKMS module is not installed for kernel $(uname -r)" + return 1 + fi + + if ! modinfo amdgpu >/dev/null 2>&1; then + err "$test" "amdgpu kernel module metadata is not available" + return 1 + fi + + if grep -qsE '^[[:space:]]*(blacklist[[:space:]]+amdgpu|install[[:space:]]+amdgpu[[:space:]]+/bin/false)([[:space:]]|$)' /etc/modprobe.d/*.conf 2>/dev/null; then + err "$test" "amdgpu is still disabled in /etc/modprobe.d" + return 1 + fi + + if ! grep -qx amdgpu /etc/modules-load.d/amdgpu.conf; then + err "$test" "amdgpu is not configured to load on boot" + return 1 + fi + + if ! command -v rocminfo >/dev/null 2>&1; then + err "$test" "rocminfo is not on PATH" + return 1 + fi + + if ! command -v rocm-smi >/dev/null 2>&1; then + err "$test" "rocm-smi is not on PATH" + return 1 + fi + + if [ ! 
-f /opt/rocm/.info/version ]; then + err "$test" "/opt/rocm/.info/version is missing" + return 1 + fi + + if grep -R "repo.radeon.com" /etc/apt/sources.list /etc/apt/sources.list.d /etc/apt/preferences /etc/apt/preferences.d >/dev/null 2>&1; then + err "$test" "repo.radeon.com apt source or pin is still present on the VHD" + return 1 + fi + + if [ -e /etc/apt/keyrings/rocm.gpg ]; then + err "$test" "/etc/apt/keyrings/rocm.gpg should be removed after VHD build" + return 1 + fi + + if find /var/lib/apt/lists -maxdepth 1 -name '*repo.radeon.com*' | grep -q .; then + err "$test" "repo.radeon.com apt package indexes should be removed after VHD build" + return 1 + fi + + echo "$test: AMD ROCm prebake marker: $(tr '\n' ' ' < /opt/azure/amd-rocm/version)" + echo "$test: Finish" + return 0 +} + testCNIPluginsInstalled() { local test="testCNIPluginsInstalled" echo "$test: Start" @@ -2551,5 +2633,6 @@ testInspektorGadgetAssets testPackageDownloadURLFallbackLogic testFileOwnership $OS_SKU testDiskQueueServiceIsActive +testAmdRocmPrebake testVulnerableKernelModulesDisabled $OS_SKU $OS_VERSION testArtifactStreamingPackagesCleanedUp diff --git a/vhdbuilder/scripts/linux/ubuntu/tool_installs_ubuntu.sh b/vhdbuilder/scripts/linux/ubuntu/tool_installs_ubuntu.sh index 807d67bae41..471151793de 100755 --- a/vhdbuilder/scripts/linux/ubuntu/tool_installs_ubuntu.sh +++ b/vhdbuilder/scripts/linux/ubuntu/tool_installs_ubuntu.sh @@ -11,6 +11,10 @@ ERR_STRONGSWAN_INSTALL_TIMEOUT=187 {{/* Timeout to install strongswan */}} ERR_UA_ESM_HOOK_CLEANUP=188 {{/* Error removing the apt ESM hook for Ubuntu Pro */}} ERR_UA_MASK_UNIT=189 {{/* Error stopping/disabling/masking an Ubuntu Pro background unit */}} ERR_UA_TOKEN_CLEANUP=190 {{/* Error removing the baked-in Ubuntu Pro machine token state */}} +ERR_AMD_ROCM_UNSUPPORTED_OS=191 {{/* AMD ROCm prebake is only supported on Ubuntu 24.04 amd64 */}} +ERR_AMD_ROCM_GPG_KEY_DOWNLOAD_TIMEOUT=192 {{/* Timeout waiting for AMD ROCm GPG key download */}} 
+ERR_AMD_ROCM_INSTALL_TIMEOUT=193 {{/* Timeout waiting for AMD ROCm package install */}} +ERR_AMD_ROCM_VALIDATE_FAIL=194 {{/* AMD ROCm prebake validation failed */}} ERR_NTP_INSTALL_TIMEOUT=10 {{/*Unable to install NTP */}} ERR_NTP_START_TIMEOUT=11 {{/* Unable to start NTP */}} @@ -220,6 +224,118 @@ listInstalledPackages() { apt list --installed } +setupAmdRocmAptRepos() { + local rocm_version="${1}" + local amdgpu_repo_version="${2}" + local rocm_gpg_keyring_path="/etc/apt/keyrings/rocm.gpg" + local rocm_gpg_key_download_path="/tmp/rocm.gpg.key" + local ubuntu_codename="${UBUNTU_CODENAME:-noble}" + + if ! command -v gpg >/dev/null 2>&1; then + apt_get_install 30 1 300 gnupg || exit $ERR_AMD_ROCM_INSTALL_TIMEOUT + fi + + mkdir -p "$(dirname "${rocm_gpg_keyring_path}")" + retrycmd_curl_file 120 5 25 "${rocm_gpg_key_download_path}" "https://repo.radeon.com/rocm/rocm.gpg.key" 300 || exit $ERR_AMD_ROCM_GPG_KEY_DOWNLOAD_TIMEOUT + gpg --dearmor --yes -o "${rocm_gpg_keyring_path}" "${rocm_gpg_key_download_path}" || exit $ERR_AMD_ROCM_GPG_KEY_DOWNLOAD_TIMEOUT + rm -f "${rocm_gpg_key_download_path}" + + cat > /etc/apt/sources.list.d/rocm.list < /etc/apt/sources.list.d/amdgpu.list < /etc/apt/preferences.d/repo-radeon-pin-600 < /etc/modules-load.d/amdgpu.conf +} + +validateAmdRocmPrebake() { + local marker_path="/opt/azure/amd-rocm/version" + local kernel_version + kernel_version="$(uname -r)" + + for package_name in amdgpu-dkms rocm rocm-core rocminfo rocm-smi-lib; do + dpkg-query -W "${package_name}" >/dev/null || return 1 + done + + dkms status amdgpu | grep -q "${kernel_version}.*installed" || return 1 + modinfo amdgpu >/dev/null || return 1 + ! 
grep -qsE '^[[:space:]]*(blacklist[[:space:]]+amdgpu|install[[:space:]]+amdgpu[[:space:]]+/bin/false)([[:space:]]|$)' /etc/modprobe.d/*.conf 2>/dev/null || return 1 + grep -qx amdgpu /etc/modules-load.d/amdgpu.conf || return 1 + command -v rocminfo >/dev/null || return 1 + command -v rocm-smi >/dev/null || return 1 + [ -f /opt/rocm/.info/version ] || return 1 + [ -f "${marker_path}" ] || return 1 +} + +installAmdRocmPrebake() { + local rocm_version="${AMD_ROCM_VERSION:-7.2.4}" + local amdgpu_repo_version="${AMD_ROCM_AMDGPU_REPO_VERSION:-30.30.4}" + local amdgpu_dkms_version="${AMD_ROCM_AMDGPU_DKMS_VERSION:-1:6.16.13.30300400-2341068.24.04}" + local rocm_package_version="${AMD_ROCM_PACKAGE_VERSION:-7.2.4.70204-93~24.04}" + local rocminfo_package_version="${AMD_ROCM_ROCMINFO_VERSION:-1.0.0.70204-93~24.04}" + local rocm_smi_lib_package_version="${AMD_ROCM_SMI_LIB_VERSION:-7.8.0.70204-93~24.04}" + local kernel_version + kernel_version="$(uname -r)" + + if [ "${UBUNTU_RELEASE}" != "24.04" ] || [ "$(isARM64)" -eq 1 ]; then + echo "AMD ROCm prebake is only supported on Ubuntu 24.04 amd64. Found Ubuntu ${UBUNTU_RELEASE}, CPU_ARCH=$(getCPUArch)." + exit $ERR_AMD_ROCM_UNSUPPORTED_OS + fi + + ensureAmdRocmModuleAutoload + setupAmdRocmAptRepos "${rocm_version}" "${amdgpu_repo_version}" + + apt_get_install 30 1 600 "linux-headers-${kernel_version}" "linux-modules-extra-${kernel_version}" || exit $ERR_AMD_ROCM_INSTALL_TIMEOUT + apt_get_install 30 1 1800 \ + "amdgpu-dkms=${amdgpu_dkms_version}" \ + "rocm=${rocm_package_version}" \ + "rocm-core=${rocm_package_version}" \ + "rocminfo=${rocminfo_package_version}" \ + "rocm-smi-lib=${rocm_smi_lib_package_version}" || exit $ERR_AMD_ROCM_INSTALL_TIMEOUT + + mkdir -p /opt/azure/amd-rocm + cat > /opt/azure/amd-rocm/version <