Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions aks-node-controller/parser/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,13 @@ func getEnableNvidia(config *aksnodeconfigv1.Configuration) bool {
return false
}

// getEnableAmdGpu reports whether the node configuration explicitly enables
// AMD GPU support. It returns false when the GpuConfig section is absent or
// when its EnableAmdGpu field has not been set.
func getEnableAmdGpu(config *aksnodeconfigv1.Configuration) bool {
	gpuConfig := config.GpuConfig
	if gpuConfig == nil || gpuConfig.EnableAmdGpu == nil {
		return false
	}
	return *gpuConfig.EnableAmdGpu
}

func removeNewlines(str string) string {
sanitizedStr := strings.ReplaceAll(str, "\n", "")
sanitizedStr = strings.ReplaceAll(sanitizedStr, "\r", "")
Expand Down
1 change: 1 addition & 0 deletions aks-node-controller/parser/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string {
"API_SERVER_NAME": config.GetApiServerConfig().GetApiServerName(),
"IS_VHD": fmt.Sprintf("%v", getIsVHD(config.IsVhd)),
"GPU_NODE": fmt.Sprintf("%v", getEnableNvidia(config)),
"AMD_GPU_NODE": fmt.Sprintf("%v", getEnableAmdGpu(config)),
"SGX_NODE": fmt.Sprintf("%v", getIsSgxEnabledSKU(config.GetVmSize())),
"MIG_NODE": fmt.Sprintf("%v", getIsMIGNode(config.GetGpuConfig().GetGpuInstanceProfile())),
"CONFIG_GPU_DRIVER_IF_NEEDED": fmt.Sprintf("%v", config.GetGpuConfig().GetConfigGpuDriver()),
Expand Down
17 changes: 17 additions & 0 deletions aks-node-controller/parser/parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ func TestBuildCSECmd(t *testing.T) {
validator: func(cmd *exec.Cmd) {
vars := environToMap(cmd.Env)
assert.Equal(t, "false", vars["GPU_NODE"])
assert.Equal(t, "false", vars["AMD_GPU_NODE"])
assert.NotEmpty(t, vars["CONTAINERD_CONFIG_NO_GPU_CONTENT"])
// Ensure the containerd config does not use the
// nvidia container runtime when skipping the
Expand Down Expand Up @@ -69,6 +70,21 @@ oom_score = -999
require.Equal(t, expectedShimConfig, containerdConfigFileContent)
},
},
// AMD GPU case: Nvidia is switched off and AMD GPU is switched on, then the
// generated CSE environment is checked for GPU_NODE=false / AMD_GPU_NODE=true.
// NOTE(review): the case name and fixture folder refer to Ubuntu 22.04 (and the
// MIG fixture), while the CSE-side ROCm installer only accepts Ubuntu 24.04;
// this case only asserts the emitted environment variables — confirm that the
// fixture choice is intentional.
{
name: "AKSUbuntu2204 containerd with AMD GPU",
folder: "AKSUbuntu2204+Containerd+MIG",
k8sVersion: "1.19.13",
aksNodeConfigUpdator: func(aksNodeConfig *aksnodeconfigv1.Configuration) {
// Explicitly disable Nvidia so that only the AMD flag is expected to be true.
aksNodeConfig.GpuConfig.EnableNvidia = to.Ptr(false)
aksNodeConfig.GpuConfig.EnableAmdGpu = to.Ptr(true)
aksNodeConfig.VmSize = "Standard_ND96isr_MI300X_v5"
},
validator: func(cmd *exec.Cmd) {
vars := environToMap(cmd.Env)
assert.Equal(t, "false", vars["GPU_NODE"])
assert.Equal(t, "true", vars["AMD_GPU_NODE"])
},
},
{
name: "AKSUbuntu2204 DisableSSH with enabled ssh",
folder: "AKSUbuntu2204+SSHStatusOn",
Expand Down Expand Up @@ -452,6 +468,7 @@ func TestAKSNodeConfigCompatibilityFromJsonToCSECommand(t *testing.T) {
assertHasKeyWithValue(t, vars, "NETWORK_PLUGIN", "")
assertHasKeyWithValue(t, vars, "VNET_CNI_PLUGINS_URL", "")
assertHasKeyWithValue(t, vars, "GPU_NODE", "false")
assertHasKeyWithValue(t, vars, "AMD_GPU_NODE", "false")
assertHasKeyWithValue(t, vars, "GPU_INSTANCE_PROFILE", "")
assertHasKeyWithValue(t, vars, "CUSTOM_CA_TRUST_COUNT", "0")
assertHasKeyWithValue(t, vars, "SHOULD_CONFIGURE_CUSTOM_CA_TRUST", "false")
Expand Down
1 change: 1 addition & 0 deletions parts/linux/cloud-init/artifacts/cse_cmd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ IDENTITY_BINDINGS_LOCAL_AUTHORITY_SNI={{GetVariable "identityBindingsLocalAuthor
API_SERVER_NAME={{GetKubernetesEndpoint}}
IS_VHD={{GetVariable "isVHD"}}
GPU_NODE={{GetVariable "gpuNode"}}
AMD_GPU_NODE={{GetVariable "amdGpuNode"}}
SGX_NODE={{GetVariable "sgxNode"}}
MIG_NODE={{GetVariable "migNode"}}
CONFIG_GPU_DRIVER_IF_NEEDED={{GetVariable "configGPUDriverIfNeeded"}}
Expand Down
4 changes: 4 additions & 0 deletions parts/linux/cloud-init/artifacts/cse_helpers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,10 @@ ERR_AKS_NODE_CONTROLLER_ERROR=240 # Generic error in AKS Node Controller
ERR_AZNFS_RPM_DOWNLOAD_TIMEOUT=241 # Timeout downloading aznfs RPM from PMC
ERR_AZNFS_INSTALL_FAIL=242 # Failed to install aznfs RPM package
ERR_SECONDARY_NIC_CONFIG_FAIL=243 # Error configuring secondary NIC network interface
ERR_AMD_ROCM_UNSUPPORTED_OS=244 # AMD ROCm CSE install is only supported on Ubuntu 24.04 amd64; also returned when the VM SKU is not a supported AMD GPU SKU
ERR_AMD_ROCM_GPG_KEY_DOWNLOAD_TIMEOUT=245 # Timeout waiting for AMD ROCm GPG key download
ERR_AMD_ROCM_INSTALL_TIMEOUT=246 # Timeout waiting for AMD ROCm package install
ERR_AMD_ROCM_VALIDATE_FAIL=247 # AMD ROCm CSE validation failed
# -----------------------------------------------------------------------------

# This probably wasn't launched via a login shell, so ensure the PATH is correct.
Expand Down
10 changes: 8 additions & 2 deletions parts/linux/cloud-init/artifacts/cse_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -423,8 +423,14 @@ function nodePrep {
# By default, never reboot new nodes.
REBOOTREQUIRED=false

# Install and configure GPU drivers if this is a GPU node
if [ "${GPU_NODE}" = "true" ] && [ "${skip_nvidia_driver_install}" != "true" ]; then
# Install and configure AMD GPU drivers if this is an AMD GPU node.
if [ "${AMD_GPU_NODE}" = "true" ]; then
echo $(date),$(hostname), "Start configuring AMD GPU drivers"
logs_to_events "AKS.CSE.ensureAmdGpuDrivers" ensureAmdGpuDrivers
echo $(date),$(hostname), "End configuring AMD GPU drivers"

# Install and configure NVIDIA GPU drivers if this is an NVIDIA GPU node.
elif [ "${GPU_NODE}" = "true" ] && [ "${skip_nvidia_driver_install}" != "true" ]; then
echo $(date),$(hostname), "Start configuring GPU drivers"

# Install GPU drivers
Expand Down
180 changes: 180 additions & 0 deletions parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,186 @@ removeNvidiaRepos() {
fi
}

# amdRocmUbuntuRelease prints the Ubuntu release number used by the AMD ROCm
# support check. A non-empty UBUNTU_RELEASE takes precedence; otherwise
# /etc/os-release is sourced and its VERSION_ID is printed.
amdRocmUbuntuRelease() {
    if [ -z "${UBUNTU_RELEASE:-}" ]; then
        . /etc/os-release
        echo "${VERSION_ID}"
    else
        echo "${UBUNTU_RELEASE}"
    fi
}

# amdRocmUbuntuCodename prints the Ubuntu codename used as the suite in the
# AMD apt source lines. A non-empty UBUNTU_CODENAME takes precedence; otherwise
# /etc/os-release is sourced and its VERSION_CODENAME is printed.
amdRocmUbuntuCodename() {
    if [ -z "${UBUNTU_CODENAME:-}" ]; then
        . /etc/os-release
        echo "${VERSION_CODENAME}"
    else
        echo "${UBUNTU_CODENAME}"
    fi
}

# isAmdRocmSupportedSku succeeds when the VM SKU reported by get_compute_sku is
# one of the MI300X sizes listed below (compared case-insensitively). For any
# other value, including an empty one when get_compute_sku fails, it prints a
# diagnostic and returns 1.
isAmdRocmSupportedSku() {
    local detected_sku
    # Failures of the SKU lookup are tolerated here; an empty SKU simply does
    # not match any supported pattern.
    detected_sku="$(get_compute_sku 2>/dev/null || true)"
    case "${detected_sku,,}" in
        standard_nd96isr_mi300x_v5 | standard_nd96is_mi300x_v5)
            return 0
            ;;
        *)
            echo "AMD ROCm CSE install is not supported for VM SKU '${detected_sku}'"
            return 1
            ;;
    esac
}

# setupAmdRocmAptRepos points apt at the repo.radeon.com package repositories.
# Arguments:
#   $1 - ROCm version used in the rocm and graphics repository URLs
#   $2 - amdgpu repository version used in the amdgpu repository URL
# It installs gnupg when gpg is missing, downloads and de-armors the ROCm
# signing key into /etc/apt/keyrings, writes the rocm/amdgpu source lists and a
# priority-600 pin for repo.radeon.com, then refreshes the apt indexes. Any
# failing step exits the script with the matching ERR_* code.
setupAmdRocmAptRepos() {
    local rocm_version="${1}"
    local amdgpu_repo_version="${2}"
    local radeon_repo_url="https://repo.radeon.com"
    local rocm_gpg_keyring_path="/etc/apt/keyrings/rocm.gpg"
    local rocm_gpg_key_download_path="/tmp/rocm.gpg.key"
    local ubuntu_codename
    local apt_source_prefix
    ubuntu_codename="$(amdRocmUbuntuCodename)"
    apt_source_prefix="deb [arch=amd64 signed-by=${rocm_gpg_keyring_path}]"

    # Apply proxy settings, when provided, before any network access below.
    [ -z "${PROXY_VARS}" ] || eval "${PROXY_VARS}"

    # gpg is needed to convert the armored key into a keyring file.
    command -v gpg >/dev/null 2>&1 || apt_get_install 30 1 300 gnupg || exit $ERR_AMD_ROCM_INSTALL_TIMEOUT

    mkdir -p "${rocm_gpg_keyring_path%/*}"
    retrycmd_curl_file 120 5 25 "${rocm_gpg_key_download_path}" "${radeon_repo_url}/rocm/rocm.gpg.key" 300 || exit $ERR_AMD_ROCM_GPG_KEY_DOWNLOAD_TIMEOUT
    gpg --dearmor --yes -o "${rocm_gpg_keyring_path}" "${rocm_gpg_key_download_path}" || exit $ERR_AMD_ROCM_GPG_KEY_DOWNLOAD_TIMEOUT
    rm -f "${rocm_gpg_key_download_path}"

    cat > /etc/apt/sources.list.d/rocm.list <<EOF
${apt_source_prefix} ${radeon_repo_url}/rocm/apt/${rocm_version} ${ubuntu_codename} main
${apt_source_prefix} ${radeon_repo_url}/graphics/${rocm_version}/ubuntu ${ubuntu_codename} main
EOF

    cat > /etc/apt/sources.list.d/amdgpu.list <<EOF
${apt_source_prefix} ${radeon_repo_url}/amdgpu/${amdgpu_repo_version}/ubuntu ${ubuntu_codename} main
EOF

    # Prefer packages originating from repo.radeon.com over the distro's own.
    cat > /etc/apt/preferences.d/repo-radeon-pin-600 <<'EOF'
Package: *
Pin: release o=repo.radeon.com
Pin-Priority: 600
EOF

    apt_get_update || exit $ERR_APT_UPDATE_TIMEOUT
}

# removeAmdRocmAptRepos deletes the apt source lists, the pin file and the
# keyring associated with repo.radeon.com, plus any cached apt index files
# whose name contains repo.radeon.com. Missing files are ignored (rm -f).
removeAmdRocmAptRepos() {
    local radeon_apt_file
    for radeon_apt_file in \
        /etc/apt/sources.list.d/rocm.list \
        /etc/apt/sources.list.d/amdgpu.list \
        /etc/apt/sources.list.d/amdgpu-proprietary.list \
        /etc/apt/preferences.d/repo-radeon-pin-600 \
        /etc/apt/keyrings/rocm.gpg; do
        rm -f "${radeon_apt_file}"
    done
    # The glob must stay unquoted so every cached index for the repo is matched.
    rm -f /var/lib/apt/lists/*repo.radeon.com*
}

# amdRocmBinaryPath prints how to invoke the ROCm tool named by $1.
# The name is first resolved with `command -v`; if that fails, the executable
# /opt/rocm/bin/<name> is used when present. Returns 1, printing nothing, when
# neither lookup succeeds.
amdRocmBinaryPath() {
    local binary_name="${1}"
    local rocm_fallback="/opt/rocm/bin/${binary_name}"

    # `command -v` prints the resolved name itself on success.
    if command -v "${binary_name}" 2>/dev/null; then
        return 0
    fi
    if [ ! -x "${rocm_fallback}" ]; then
        return 1
    fi
    echo "${rocm_fallback}"
    return 0
}

# ensureAmdRocmModuleAutoload makes the amdgpu kernel module eligible for
# loading at boot:
#   - removes "blacklist amdgpu" and "install amdgpu /bin/false" lines from
#     every regular file matching /etc/modprobe.d/*.conf, and
#   - writes /etc/modules-load.d/amdgpu.conf containing the single line
#     "amdgpu".
ensureAmdRocmModuleAutoload() {
    # Keep the loop variable function-scoped so it does not leak into the
    # calling shell's global namespace.
    local modprobe_conf

    mkdir -p /etc/modules-load.d
    for modprobe_conf in /etc/modprobe.d/*.conf; do
        # Skips the literal pattern when the glob matches nothing, and any
        # non-regular entries.
        [ -f "${modprobe_conf}" ] || continue
        sed -i '/^[[:space:]]*blacklist[[:space:]]\+amdgpu\([[:space:]]\|$\)/d' "${modprobe_conf}"
        sed -i '/^[[:space:]]*install[[:space:]]\+amdgpu[[:space:]]\+\/bin\/false\([[:space:]]\|$\)/d' "${modprobe_conf}"
    done
    printf '%s\n' amdgpu > /etc/modules-load.d/amdgpu.conf
}

# validateAmdRocmDriver returns 0 only when every check below passes, and
# returns 1 at the first failing check:
#   1. the expected AMD packages are known to dpkg;
#   2. dkms reports an amdgpu build as installed for the running kernel and
#      modinfo can find the module;
#   3. no modprobe.d config blacklists or disables amdgpu, and the
#      modules-load.d entry for amdgpu exists;
#   4. the module loads and the /dev/kfd and /dev/dri/renderD* device nodes
#      appear (each step retried through retrycmd_if_failure);
#   5. rocminfo output mentions gfx942 and rocm-smi reports
#      "AMD Instinct MI300X VF".
# Tool output is captured in /tmp/amd-rocminfo.out and /tmp/amd-rocm-smi.out.
validateAmdRocmDriver() {
    local kernel_version
    local rocminfo_bin
    local rocm_smi_bin
    # Loop variable is function-scoped so it does not leak into the calling
    # shell's global namespace.
    local package_name
    kernel_version="$(uname -r)"

    for package_name in amdgpu-dkms libdrm-amdgpu-dev rocm-core rocminfo rocm-smi-lib; do
        dpkg-query -W "${package_name}" >/dev/null 2>&1 || return 1
    done

    dkms status amdgpu | grep -q "${kernel_version}.*installed" || return 1
    modinfo amdgpu >/dev/null 2>&1 || return 1
    # Fail if any modprobe.d config still blacklists or disables amdgpu.
    ! grep -qsE '^[[:space:]]*(blacklist[[:space:]]+amdgpu|install[[:space:]]+amdgpu[[:space:]]+/bin/false)([[:space:]]|$)' /etc/modprobe.d/*.conf 2>/dev/null || return 1
    grep -qx amdgpu /etc/modules-load.d/amdgpu.conf || return 1
    retrycmd_if_failure 12 5 30 modprobe amdgpu || return 1
    retrycmd_if_failure 12 5 5 test -e /dev/kfd || return 1
    retrycmd_if_failure 12 5 5 bash -c "find /dev/dri -maxdepth 1 -name 'renderD*' -print -quit | grep -q ." || return 1

    rocminfo_bin="$(amdRocmBinaryPath rocminfo)" || return 1
    rocm_smi_bin="$(amdRocmBinaryPath rocm-smi)" || return 1
    timeout 60 "${rocminfo_bin}" >/tmp/amd-rocminfo.out 2>&1 || return 1
    grep -q "gfx942" /tmp/amd-rocminfo.out || return 1
    timeout 60 "${rocm_smi_bin}" --showproductname >/tmp/amd-rocm-smi.out 2>&1 || return 1
    grep -q "AMD Instinct MI300X VF" /tmp/amd-rocm-smi.out || return 1
}

# ensureAmdGpuDrivers installs and validates the AMD GPU driver and a minimal
# ROCm package set during CSE.
# Every version is taken from the matching AMD_ROCM_* environment variable,
# falling back to the pinned default below.
# Flow:
#   1. exit with ERR_AMD_ROCM_UNSUPPORTED_OS unless running on Ubuntu 24.04
#      amd64 with a supported VM SKU;
#   2. make sure amdgpu may autoload;
#   3. return early when the version marker exists and validation passes;
#   4. otherwise set up the AMD apt repos, install kernel headers/extra modules
#      and the pinned packages, write the version marker, validate, and remove
#      the apt repos again.
ensureAmdGpuDrivers() {
    local rocm_version="${AMD_ROCM_VERSION:-7.2.4}"
    local amdgpu_repo_version="${AMD_ROCM_AMDGPU_REPO_VERSION:-30.30.4}"
    local amdgpu_dkms_version="${AMD_ROCM_AMDGPU_DKMS_VERSION:-1:6.16.13.30300400-2341068.24.04}"
    local libdrm_amdgpu_dev_version="${AMD_ROCM_LIBDRM_AMDGPU_DEV_VERSION:-1:2.4.125.07020400-2341098.24.04}"
    local rocm_package_version="${AMD_ROCM_PACKAGE_VERSION:-7.2.4.70204-93~24.04}"
    local rocminfo_package_version="${AMD_ROCM_ROCMINFO_VERSION:-1.0.0.70204-93~24.04}"
    local rocm_smi_lib_package_version="${AMD_ROCM_SMI_LIB_VERSION:-7.8.0.70204-93~24.04}"
    local version_marker="/opt/azure/amd-rocm/version"
    local kernel_version
    local ubuntu_release
    local -a pinned_packages
    kernel_version="$(uname -r)"
    ubuntu_release="$(amdRocmUbuntuRelease)"

    if [ "${OS}" != "${UBUNTU_OS_NAME}" ] || [ "${ubuntu_release}" != "24.04" ] || [ "$(isARM64)" -eq 1 ]; then
        echo "AMD ROCm CSE install is only supported on Ubuntu 24.04 amd64. Found OS=${OS}, Ubuntu=${ubuntu_release}, CPU_ARCH=$(getCPUArch)."
        exit $ERR_AMD_ROCM_UNSUPPORTED_OS
    fi
    if ! isAmdRocmSupportedSku; then
        exit $ERR_AMD_ROCM_UNSUPPORTED_OS
    fi
    ensureAmdRocmModuleAutoload

    # A marker file plus a passing validation means there is nothing to do.
    if [ -f "${version_marker}" ] && validateAmdRocmDriver; then
        echo "AMD ROCm driver is already installed and validated"
        return 0
    fi

    setupAmdRocmAptRepos "${rocm_version}" "${amdgpu_repo_version}"

    # Headers and extra modules for the running kernel go in before the DKMS package.
    apt_get_install 30 1 600 "linux-headers-${kernel_version}" "linux-modules-extra-${kernel_version}" || exit $ERR_AMD_ROCM_INSTALL_TIMEOUT

    pinned_packages=(
        "amdgpu-dkms=${amdgpu_dkms_version}"
        "libdrm-amdgpu-dev=${libdrm_amdgpu_dev_version}"
        "rocm-core=${rocm_package_version}"
        "rocminfo=${rocminfo_package_version}"
        "rocm-smi-lib=${rocm_smi_lib_package_version}"
    )
    apt_get_install 30 1 2400 "${pinned_packages[@]}" || exit $ERR_AMD_ROCM_INSTALL_TIMEOUT
    ldconfig || exit $ERR_AMD_ROCM_INSTALL_TIMEOUT

    # Record what was installed; the early-return check above looks for this file.
    mkdir -p "${version_marker%/*}"
    cat > "${version_marker}" <<EOF
install_mode=cse
package_set=minimal-host
rocm_version=${rocm_version}
amdgpu_repo_version=${amdgpu_repo_version}
amdgpu_dkms_version=${amdgpu_dkms_version}
libdrm_amdgpu_dev_version=${libdrm_amdgpu_dev_version}
kernel=${kernel_version}
installed_at=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
EOF
    chmod 644 "${version_marker}"

    if ! validateAmdRocmDriver; then
        exit $ERR_AMD_ROCM_VALIDATE_FAIL
    fi
    removeAmdRocmAptRepos
}

# cleanUpPrebakedGPUDriver removes a CUDA driver pre-baked into the shared VHD on any node that does
# NOT install the AKS-managed driver -- the cleanUpGPUDrivers path (GPU_NODE != true OR
# skip_nvidia_driver_install=true): non-GPU VMs, and GPU VMs opted out via --gpu-driver None or the
Expand Down
14 changes: 14 additions & 0 deletions pkg/agent/baker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1124,6 +1124,20 @@ var _ = Describe("getLinuxNodeCSECommand", func() {
Expect(vars).To(HaveKeyWithValue("GPU_DRIVER_TYPE", "cuda-lts"))
})

// Verifies that enabling AMD GPU on the bootstrapping config is reflected in
// the generated Linux CSE command as AMD_GPU_NODE=true.
It("should handle AMD GPU configuration", func() {
baseConfig.EnableAMDGPU = true
baseConfig.AgentPoolProfile.VMSize = "Standard_ND96isr_MI300X_v5"

cseCmd := templateGenerator.getLinuxNodeCSECommand(baseConfig)

// The generated command must be non-empty and contain no newline characters.
Expect(cseCmd).NotTo(BeEmpty())
Expect(strings.Contains(cseCmd, "\n")).To(BeFalse())

// GPU_NODE is expected to stay "false" — presumably because baseConfig does
// not enable Nvidia by default; verify against the suite's setup.
vars := decodeCSEVars(cseCmd)
Expect(vars).To(HaveKeyWithValue("GPU_NODE", "false"))
Expect(vars).To(HaveKeyWithValue("AMD_GPU_NODE", "true"))
})

It("should handle custom cloud environment", func() {
baseConfig.ContainerService.Properties.CustomCloudEnv = &datamodel.CustomCloudEnv{
Name: "akscustom",
Expand Down
1 change: 1 addition & 0 deletions pkg/agent/variables.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ func getCSECommandVariables(config *datamodel.NodeBootstrappingConfiguration) pa
"userAssignedIdentityID": config.UserAssignedIdentityClientID,
"isVHD": isVHD(profile),
"gpuNode": strconv.FormatBool(config.EnableNvidia),
"amdGpuNode": strconv.FormatBool(config.EnableAMDGPU),
"sgxNode": strconv.FormatBool(datamodel.IsSgxEnabledSKU(profile.VMSize)),
"configGPUDriverIfNeeded": config.ConfigGPUDriverIfNeeded,
"enableGPUDevicePluginIfNeeded": config.EnableGPUDevicePluginIfNeeded,
Expand Down
12 changes: 12 additions & 0 deletions pkg/agent/variables_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,18 @@ var _ = Describe("Windows CSE variables check", func() {
Expect(vars["configGPUDriverIfNeeded"]).To(Equal(false))
})

// EnableAMDGPU=true must surface as the string "true" under the amdGpuNode key.
It("sets amdGpuNode to true", func() {
config.EnableAMDGPU = true
vars := getCSECommandVariables(config)
Expect(vars["amdGpuNode"]).To(Equal("true"))
})

// EnableAMDGPU=false must surface as the string "false" under the amdGpuNode key.
It("sets amdGpuNode to false", func() {
config.EnableAMDGPU = false
vars := getCSECommandVariables(config)
Expect(vars["amdGpuNode"]).To(Equal("false"))
})

It("sets windowsSecureTlsEnabled to true", func() {
value := true
config.ContainerService.Properties.WindowsProfile.WindowsSecureTlsEnabled = &value
Expand Down
9 changes: 9 additions & 0 deletions vhdbuilder/packer/install-dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -827,6 +827,15 @@ EOF
fi
fi

# Optional AMD ROCm prebake: runs only when FEATURE_FLAGS contains "AMD_ROCM".
# After the install, the contents of /opt/azure/amd-rocm/version are appended
# to the VHD log file as one line (newlines replaced by spaces).
# NOTE(review): installAmdRocmPrebake is not defined in the code visible here —
# confirm it is provided by a sourced script.
if grep -q "AMD_ROCM" <<< "$FEATURE_FLAGS"; then
echo "Installing AMD ROCm and AMDGPU driver into the VHD"
installAmdRocmPrebake
cat << EOF >> ${VHD_LOGS_FILEPATH}
- amd-rocm-prebake=$(cat /opt/azure/amd-rocm/version | tr '\n' ' ')
EOF
fi
# The benchmark checkpoint is recorded whether or not the prebake ran.
capture_benchmark "${SCRIPT_NAME}_install_amd_rocm_prebake"

if [ -d "/opt/gpu" ] && [ "$(ls -A /opt/gpu)" ]; then
ls -ltr /opt/gpu/* >> ${VHD_LOGS_FILEPATH}
fi
Expand Down
Loading
Loading