diff --git a/.github/workflows/azure-dev.yml b/.github/workflows/azure-dev.yml
index 9dd0621..bf6fca2 100644
--- a/.github/workflows/azure-dev.yml
+++ b/.github/workflows/azure-dev.yml
@@ -59,21 +59,27 @@ jobs:
uses: azure/setup-kubectl@v3
- name: Install helm
uses: azure/setup-helm@v4
- - name: Log in with Azure (Federated Credentials)
+ - name: Login to Azure
+ uses: azure/login@v2
+ with:
+ client-id: ${{ vars.AZURE_CLIENT_ID }}
+ tenant-id: ${{ vars.AZURE_TENANT_ID }}
+ subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }}
+ - name: Set Azure subscription
+ shell: bash
+ run: az account set --subscription "$AZURE_SUBSCRIPTION_ID"
+ - name: Login to AZD
+ shell: bash
run: |
- azd auth login `
- --client-id "$Env:AZURE_CLIENT_ID" `
- --federated-credential-provider "github" `
- --tenant-id "$Env:AZURE_TENANT_ID"
- shell: pwsh
-
- - name: Provision Infrastructure
- run: azd provision --no-prompt
- env:
- AZD_INITIAL_ENVIRONMENT_CONFIG: ${{ secrets.AZD_INITIAL_ENVIRONMENT_CONFIG }}
-
- # NOTE: azure.yaml has no services: block, so azd deploy is a no-op.
- # The postprovision hook handles all post-Bicep setup (Arc, GPU, VI extension).
- # This step is kept for forward-compatibility if app services are added.
- - name: Deploy Application
- run: azd deploy --no-prompt
+ azd auth login \
+ --client-id "$AZURE_CLIENT_ID" \
+ --federated-credential-provider "github" \
+ --tenant-id "$AZURE_TENANT_ID"
+ - name: Provision
+ shell: bash
+ run: |
+ if ! azd env select "$AZURE_ENV_NAME"; then
+ azd env new "$AZURE_ENV_NAME" --subscription "$AZURE_SUBSCRIPTION_ID" --location "$AZURE_LOCATION" --no-prompt
+ fi
+ azd config set defaults.subscription "$AZURE_SUBSCRIPTION_ID"
+ azd up --no-prompt
diff --git a/README.md b/README.md
index bf8194f..8badad3 100644
--- a/README.md
+++ b/README.md
@@ -25,11 +25,11 @@ By deploying the Video Indexer Arc extension on an Arc-enabled AKS cluster with
This solution optionally creates a Microsoft Foundry project and Foundry Tools (enabled by default; set `CREATE_FOUNDRY_PROJECT=false` to skip). More details about the resources can be found in the [resources](#resources) documentation.
-### Solution architecture
+## Solution architecture
||
|---|
-### Key features
+## Features
Learn more about the key features this solution enables
@@ -44,9 +44,10 @@ This solution optionally creates a Microsoft Foundry project and Foundry Tools (
-Getting Started
+Quick Deploy
+## Getting Started
Follow the quick deploy steps on the deployment guide to deploy this solution to your own Azure subscription.
> **Note:** This solution accelerator requires **Azure Developer CLI (azd) version 1.18.0 or higher**. Please ensure you have the latest version installed before proceeding with deployment. [Download azd here](https://learn.microsoft.com/en-us/azure/developer/azure-developer-cli/install-azd).
diff --git a/azure.yaml b/azure.yaml
index feac005..fb06423 100644
--- a/azure.yaml
+++ b/azure.yaml
@@ -50,13 +50,6 @@ hooks:
shell: sh
run: ./hooks/postup.sh
interactive: true
- predown:
- windows:
- shell: pwsh
- run: ./hooks/predown.ps1
- posix:
- shell: sh
- run: ./hooks/predown.sh
pipeline:
variables:
- AZURE_RESOURCE_GROUP
diff --git a/hooks/common.ps1 b/hooks/common.ps1
index 40627ce..913659c 100644
--- a/hooks/common.ps1
+++ b/hooks/common.ps1
@@ -7,6 +7,54 @@
. "$PSScriptRoot/config.ps1"
. "$PSScriptRoot/ui.ps1"
+# ── Helpers ─────────────────────────────────────────────────────────────────
+
+function Invoke-AzJson {
+ <#
+ .SYNOPSIS
+ Runs an `az` command (passed as a script block), captures
+ stdout, and returns parsed JSON. Returns $null if the command produced no
+ output or if parsing failed. Never throws — callers must null-check.
+ #>
+ param([Parameter(Mandatory)] [scriptblock]$Command)
+ try {
+ $raw = & $Command 2>&1
+ if ($LASTEXITCODE -ne 0) {
+ $snippet = if ($raw) { ($raw | Out-String).Trim() } else { '(no output)' }
+ if ($snippet.Length -gt 400) { $snippet = $snippet.Substring(0, 400) + '...' }
+ Log-Warning "az command failed (exit $LASTEXITCODE): $snippet"
+ return $null
+ }
+ if ($null -eq $raw -or ($raw -is [string] -and [string]::IsNullOrWhiteSpace($raw))) {
+ return $null
+ }
+ if ($raw -is [array]) { $raw = ($raw -join "`n") }
+ if ([string]::IsNullOrWhiteSpace($raw)) { return $null }
+ return $raw | ConvertFrom-Json -ErrorAction Stop
+ }
+ catch {
+ Log-Warning "Invoke-AzJson exception: $($_.Exception.Message)"
+ return $null
+ }
+}
+
+function Invoke-AzdEnvSet {
+ <#
+ .SYNOPSIS
+ Persists a key/value via `azd env set`, warning (but not failing) on error.
+ #>
+ param(
+ [Parameter(Mandatory)] [string]$Name,
+ [Parameter(Mandatory)] [AllowEmptyString()] [string]$Value
+ )
+ & azd env set $Name $Value 2>$null
+ if ($LASTEXITCODE -ne 0) {
+ Log-Warning "Could not persist '$Name' via 'azd env set' (exit $LASTEXITCODE)."
+ return $false
+ }
+ return $true
+}
+
# ── Prerequisite Checks ─────────────────────────────────────────────────────
function Assert-EnvVars {
@@ -125,16 +173,21 @@ function Connect-AksCluster {
function Get-RunningPodCount {
<#
.SYNOPSIS
- Returns the number of Running pods in a given namespace.
+ Returns the number of "healthy" pods in a namespace.
+ Counts both Running and Succeeded pods — some workloads (e.g. the
+ GPU operator's cuda-validator / install jobs) intentionally end in
+ Succeeded and should not be reported as degraded.
#>
param(
[string]$Namespace,
[string]$KubeContext
)
try {
- $count = (kubectl --context $KubeContext get pods -n $Namespace `
+ $running = (kubectl --context $KubeContext get pods -n $Namespace `
--field-selector=status.phase=Running --no-headers 2>$null | Measure-Object -Line).Lines
- return [int]$count
+ $succeeded = (kubectl --context $KubeContext get pods -n $Namespace `
+ --field-selector=status.phase=Succeeded --no-headers 2>$null | Measure-Object -Line).Lines
+ return [int]$running + [int]$succeeded
}
catch {
return 0
@@ -186,14 +239,15 @@ function Get-AzVmSizesForRegion {
Queries az vm list-skus for a region and returns an array of hashtables.
Uses list-skus (not list-sizes) to respect subscription restrictions —
only SKUs the subscription is allowed to use are returned.
- Each entry: @{ Name; Cores; MemoryGB }
+ Each entry: @{ Name; Cores; MemoryGB; Family }
+ Family is the quota family name (matches az vm list-usage name.value).
#>
param([string]$Location)
- $query = "[?restrictions[?type=='Location']|length(@)==``0``].{name:name, vCPUs:capabilities[?name=='vCPUs'].value|[0], memGB:capabilities[?name=='MemoryGB'].value|[0]}"
- $raw = (az vm list-skus --location $Location --resource-type virtualMachines --query $query -o json 2>$null) | ConvertFrom-Json
+ $query = "[?restrictions[?type=='Location']|length(@)==``0``].{name:name, family:family, vCPUs:capabilities[?name=='vCPUs'].value|[0], memGB:capabilities[?name=='MemoryGB'].value|[0]}"
+ $raw = Invoke-AzJson { az vm list-skus --location $Location --resource-type virtualMachines --query $query -o json }
if (-not $raw) { return @() }
return $raw | ForEach-Object {
- @{ Name = $_.name; Cores = [int]$_.vCPUs; MemoryGB = [int]$_.memGB }
+ @{ Name = $_.name; Cores = [int]$_.vCPUs; MemoryGB = [int]$_.memGB; Family = $_.family }
}
}
@@ -238,6 +292,11 @@ function Select-VmSizesForMenu {
For each family, matches VM names that contain the family pattern,
filters by core range, takes up to $SizesPerFamily sorted by cores.
The default SKU is always included. Result is sorted by cores.
+
+ When QuotaData is supplied, each entry is annotated with quota info
+ (AvailableQuota, QuotaLimit, QuotaFamily, HasEnoughQuota). SKUs whose
+ family has too little quota to run $MaxNodes of that size are dropped,
+ except the default SKU which is kept (annotated) so the user sees it.
#>
param(
[array]$AllSizes,
@@ -246,7 +305,9 @@ function Select-VmSizesForMenu {
[string]$DefaultSku,
[int]$SizesPerFamily = 3,
[int]$MinCores = 0,
- [int]$MaxCores = [int]::MaxValue
+ [int]$MaxCores = [int]::MaxValue,
+ [hashtable]$QuotaData,
+ [int]$MaxNodes = 1
)
# Filter by broad prefix first (CPU vs GPU), then by core range
@@ -273,8 +334,49 @@ function Select-VmSizesForMenu {
if ($defVm) { $selected[$defVm.Name] = $defVm }
}
- # Return sorted by cores
- return $selected.Values | Sort-Object { $_.Cores }, { $_.Name }
+ $entries = $selected.Values | Sort-Object { $_.Cores }, { $_.Name }
+
+ # ── Annotate with quota + drop SKUs that cannot satisfy $MaxNodes ─────
+ if ($QuotaData -and $QuotaData.Count -gt 0) {
+ $annotated = @()
+ foreach ($vm in $entries) {
+ $fam = Get-QuotaFamilyForVm -VmSize $vm.Name -SkuFamily $vm.Family
+ $avail = $null; $limit = $null; $hasEnough = $true; $familyKnown = $false
+ if ($fam -and $QuotaData.ContainsKey($fam)) {
+ $familyKnown = $true
+ $avail = [int]$QuotaData[$fam].Available
+ $limit = [int]$QuotaData[$fam].Limit
+ $needed = [int]$vm.Cores * [math]::Max(1, [int]$MaxNodes)
+ $hasEnough = ($limit -gt 0) -and ($avail -ge $needed)
+ }
+ # Clone hashtable so we don't mutate the shared $AllSizes entries
+ $copy = @{}
+ foreach ($k in $vm.Keys) { $copy[$k] = $vm[$k] }
+ $copy.QuotaFamily = $fam
+ $copy.QuotaFamilyKnown = $familyKnown
+ $copy.AvailableQuota = $avail
+ $copy.QuotaLimit = $limit
+ $copy.HasEnoughQuota = $hasEnough
+ $annotated += ,$copy
+ }
+
+ # Keep SKUs that either (a) have enough quota in a known family,
+ # (b) are in an unknown family (can't verify — don't hide newer SKUs),
+ # or (c) are the configured default. Unknown families are treated as
+ # "OK" here; the final quota check before submission still guards them.
+ $filtered = $annotated | Where-Object {
+ (-not $_.QuotaFamilyKnown) -or $_.HasEnoughQuota -or ($_.Name -eq $DefaultSku)
+ }
+
+ # Fallback: if quota would empty the list, return the unfiltered annotated
+ # set so the user can still pick something and be warned.
+ if (-not $filtered -or @($filtered).Count -eq 0) {
+ return $annotated
+ }
+ return @($filtered)
+ }
+
+ return $entries
}
function Get-AzVmQuotaForRegion {
@@ -285,7 +387,7 @@ function Get-AzVmQuotaForRegion {
#>
param([string]$Location)
$result = @{}
- $raw = (az vm list-usage --location $Location -o json 2>$null) | ConvertFrom-Json
+ $raw = Invoke-AzJson { az vm list-usage --location $Location -o json }
if (-not $raw) { return $result }
foreach ($q in $raw) {
$result[$q.name.value] = @{
@@ -300,14 +402,21 @@ function Get-AzVmQuotaForRegion {
function Get-QuotaFamilyForVm {
<#
.SYNOPSIS
- Resolves the quota family name for a GPU VM size using the
- GPU_QUOTA_FAMILY_MAP regex lookup table. No API call needed.
- Returns the family string, or $null if no pattern matches.
+ Resolves the quota family name for a VM size.
+ Prefers the family string reported by az vm list-skus (passed via
+ -SkuFamily) because it matches az vm list-usage's name.value directly.
+ Falls back to the GPU_QUOTA_FAMILY_MAP regex table for older SKUs where
+ the family field is empty.
+ Returns the family string, or $null if nothing matches.
#>
param(
[string]$VmSize,
+ [string]$SkuFamily,
[string]$Location # kept for interface compat, not used
)
+ if (-not [string]::IsNullOrWhiteSpace($SkuFamily)) {
+ return $SkuFamily
+ }
foreach ($pattern in $GPU_QUOTA_FAMILY_MAP.Keys) {
if ($VmSize -match $pattern) {
return $GPU_QUOTA_FAMILY_MAP[$pattern]
@@ -378,12 +487,10 @@ function Resolve-ModelQuota {
$modelType = "$Format.$DeploymentType.$Model"
Log-Info "Checking quota for $modelType in $Location..."
- $modelInfo = $null
- try {
- $modelInfo = (az cognitiveservices usage list --location $Location `
- --query "[?name.value=='$modelType'] | [0]" -o json 2>$null) | ConvertFrom-Json
+ $modelInfo = Invoke-AzJson {
+ az cognitiveservices usage list --location $Location `
+ --query "[?name.value=='$modelType'] | [0]" -o json
}
- catch { }
if (-not $modelInfo) {
Log-Warning "No quota info found for '$modelType' in '$Location'. Skipping quota check."
@@ -421,7 +528,7 @@ function Resolve-ModelQuota {
}
} while (-not $validInput)
- azd env set $CapacityEnvVarName $parsed 2>$null
+ [void](Invoke-AzdEnvSet -Name $CapacityEnvVarName -Value "$parsed")
Log-Success "Capacity adjusted to $parsed (saved to $CapacityEnvVarName)"
}
else {
@@ -502,8 +609,20 @@ function Show-VmSelectionMenu {
$name = $Entry.Name.PadRight(35)
$cores = "$($Entry.Cores) vCPUs".PadRight(10)
$mem = "$($Entry.MemoryGB) GB".PadRight(8)
- $tag = if ($IsDefault) { " (default)" } else { "" }
- return "${name} ${cores} ${mem}${tag}"
+
+ # Quota column (only when quota data was supplied)
+ $quotaCol = ""
+ if ($Entry.ContainsKey('QuotaFamilyKnown')) {
+ if ($Entry.QuotaFamilyKnown) {
+ $quotaCol = "$($Entry.AvailableQuota) free".PadRight(14)
+ }
+ else {
+ $quotaCol = "quota n/a".PadRight(14)
+ }
+ }
+
+ $tag = if ($IsDefault) { " (default)" } else { "" }
+ return "${name} ${cores} ${mem} ${quotaCol}${tag}"
}
# ── Redraw the visible viewport in-place ───────────────────────
@@ -577,6 +696,9 @@ function Show-VmSelectionMenu {
Write-Host ""
Write-Section "Select VM size for $PoolName ($($VmSizes.Count) sizes available)"
Log-Info "Use $([char]0x2191)/$([char]0x2193) to move, Enter to select, C custom, Esc cancel"
+ if ($VmSizes.Count -gt 0 -and $VmSizes[0].ContainsKey('QuotaFamilyKnown')) {
+ Log-Info "Quota column shows cores free in this region (pool max nodes: $MaxNodes)."
+ }
Write-Host ""
# Reserve exactly $maxVisible blank lines (viewport size, not total items)
@@ -660,17 +782,17 @@ function Show-VmSelectionMenu {
# ── GPU quota check ────────────────────────────────────────
if ($IsGpu) {
Write-LogMessage -Message "Resolving quota family..." -Symbol $script:Sym.Info -SymbolColor $script:C.Accent -NoNewline
- $selectedFamily = Get-QuotaFamilyForVm -VmSize $selectedSku -Location $Location
+ $skuFamily = $null
+ $match = $VmSizes | Where-Object { $_.Name -eq $selectedSku } | Select-Object -First 1
+ if ($match -and $match.ContainsKey('Family')) { $skuFamily = $match.Family }
+ $selectedFamily = Get-QuotaFamilyForVm -VmSize $selectedSku -SkuFamily $skuFamily -Location $Location
if ($selectedFamily) {
Write-Host " $($script:C.Muted)$selectedFamily$($script:C.Reset)"
$totalCoresNeeded = $selectedCores * $MaxNodes
$quotaResult = Assert-VmQuota -Label $PoolName -Family $selectedFamily -QuotaData $QuotaData -CoresNeeded $totalCoresNeeded
if ($quotaResult -in @("zero", "low")) {
- $proceed = Read-Host " Continue with this VM anyway? (y = keep, n = re-select) [n]"
- if ($proceed -ne 'y' -and $proceed -ne 'Y') {
- Log-Warning "Re-showing menu..."
- continue
- }
+ Log-Warning "Re-showing menu..."
+ continue
}
}
else {
diff --git a/hooks/common.sh b/hooks/common.sh
index 1f2b747..46540d3 100755
--- a/hooks/common.sh
+++ b/hooks/common.sh
@@ -9,13 +9,27 @@ HOOKS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${HOOKS_DIR}/config.sh"
source "${HOOKS_DIR}/ui.sh"
+# ── Helpers ─────────────────────────────────────────────────────────────────
+
+# Persist a key/value via `azd env set`, warning (but not failing) on error.
+# Usage: azd_env_set NAME VALUE
+azd_env_set() {
+ local name="$1"
+ local value="${2:-}"
+ if ! azd env set "$name" "$value" 2>/dev/null; then
+ log_warning "Could not persist '${name}' via 'azd env set'."
+ return 1
+ fi
+ return 0
+}
+
# ── Prerequisite Checks ─────────────────────────────────────────────────────
assert_env_vars() {
# Usage: assert_env_vars VAR1 VAR2 VAR3
local missing=()
for var in "$@"; do
- eval val=\$$var 2>/dev/null || val=""
+ val="${!var:-}"
if [ -z "$val" ]; then
missing+=("$var")
fi
@@ -74,7 +88,7 @@ register_required_providers() {
local providers_registering=0
for provider in "${providers[@]}"; do
local state
- state=$(az provider show -n "$provider" --query "registrationState" -o tsv 2>/dev/null || echo "Unknown")
+ state=$(az provider show -n "$provider" --query "registrationState" -o tsv 2>/dev/null | tr -d '\r' || echo "Unknown")
case "$state" in
Registered)
log_success "$provider"
@@ -126,10 +140,17 @@ connect_aks_cluster() {
get_running_pod_count() {
# Usage: count=$(get_running_pod_count namespace kube_context)
+ # Counts both Running and Succeeded pods — some workloads (e.g. the GPU
+ # operator's cuda-validator / install jobs) intentionally end in Succeeded
+ # and should not be reported as degraded.
local namespace="$1"
local kube_context="$2"
- kubectl --context "$kube_context" get pods -n "$namespace" \
- --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' '
+ local running succeeded
+ running=$(kubectl --context "$kube_context" get pods -n "$namespace" \
+ --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ')
+ succeeded=$(kubectl --context "$kube_context" get pods -n "$namespace" \
+ --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ')
+ echo $(( ${running:-0} + ${succeeded:-0} ))
}
get_total_pod_count() {
@@ -151,20 +172,33 @@ test_namespace_exists() {
# Fetches VM sizes for a region, filtered by name prefixes, sorted by cores.
# Uses az vm list-skus (not list-sizes) to respect subscription restrictions.
-# Output: pipe-delimited lines "name|cores|memGB" to stdout.
+# Output: pipe-delimited lines "name|cores|memGB|family" to stdout.
+# family is the az vm list-skus "family" field — matches az vm list-usage
+# name.value so we can look up quota directly.
get_filtered_vm_sizes() {
local location="$1"; shift
local -a prefixes=("$@")
- local filter=""
- for p in "${prefixes[@]}"; do
- [ -n "$filter" ] && filter="${filter} || "
- filter="${filter}starts_with(name, '${p}')"
- done
+
+ # Build a JMESPath expression like:
+ # (starts_with(name, 'Standard_D') || starts_with(name, 'Standard_E'))
+ # so Azure does the prefix filtering server-side instead of piping every
+ # unrestricted SKU through bash.
+ local prefix_expr=""
+ if [ "${#prefixes[@]}" -gt 0 ]; then
+ local joined="" sep=""
+ for p in "${prefixes[@]}"; do
+ joined+="${sep}starts_with(name, '${p}')"
+ sep=" || "
+ done
+ prefix_expr=" && (${joined})"
+ fi
+
+ local query="[?(restrictions[?type=='Location']|length(@)==\`0\`)${prefix_expr}].{n:name, c:capabilities[?name=='vCPUs'].value|[0], m:capabilities[?name=='MemoryGB'].value|[0], f:family}"
az vm list-skus --location "$location" --resource-type virtualMachines \
- --query "sort_by([?restrictions[?type=='Location']|length(@)==\`0\` && (${filter})], &to_number(capabilities[?name=='vCPUs'].value|[0]))[].{n:name, c:capabilities[?name=='vCPUs'].value|[0], m:capabilities[?name=='MemoryGB'].value|[0]}" \
- -o tsv 2>/dev/null | while IFS=$'\t' read -r name cores memGB; do
- echo "${name}|${cores}|${memGB}"
- done
+ --query "$query" -o tsv 2>/dev/null | tr -d '\r' | while IFS=$'\t' read -r name cores memGB family; do
+ [ -z "$name" ] && continue
+ echo "${name}|${cores}|${memGB}|${family}"
+ done | sort -t'|' -k2 -n
}
# Checks if a default SKU is available; if not, picks the closest by core count.
@@ -188,7 +222,7 @@ resolve_default_sku() {
# Not available — pick closest by core count
log_warning "Default SKU '$default_sku' is not available in this subscription/region." >&2
- local best_sku="" best_diff=999999
+ local best_sku="" best_diff=999999 best_cores=0
for entry in "${_sizes[@]}"; do
local sku="${entry%%|*}"
local rest="${entry#*|}"
@@ -231,7 +265,7 @@ select_vm_sizes_for_menu() {
local cores="${rest%%|*}"
[ "$cores" -lt "$min_cores" ] && continue
[ "$cores" -gt "$max_cores" ] && continue
- if echo "$sku" | grep -qE "$pattern"; then
+ if [[ "$sku" =~ $pattern ]]; then
if [ -z "${seen[$sku]+x}" ]; then
seen[$sku]=1
_out+=("$entry")
@@ -250,10 +284,13 @@ select_vm_sizes_for_menu() {
done
fi
- # Sort output by cores
- local -a sorted
- mapfile -t sorted < <(printf '%s\n' "${_out[@]}" | sort -t'|' -k2 -n)
- _out=("${sorted[@]}")
+ # Sort output by cores (guard: empty array → printf with no args emits a
+ # bare newline that mapfile captures as one phantom empty-string element)
+ if [ "${#_out[@]}" -gt 0 ]; then
+ local -a sorted
+ mapfile -t sorted < <(printf '%s\n' "${_out[@]}" | sort -t'|' -k2 -n)
+ _out=("${sorted[@]}")
+ fi
}
# Looks up quota for a family directly via az CLI (single call).
@@ -267,7 +304,7 @@ lookup_vm_quota() {
local raw
raw=$(az vm list-usage --location "$location" \
--query "[?name.value=='${family}'] | [0].{l:limit, u:currentValue}" \
- -o tsv 2>/dev/null || true)
+ -o tsv 2>/dev/null | tr -d '\r' || true)
[ -z "$raw" ] && return 1
VM_QUOTA_LIMIT=$(echo "$raw" | cut -f1)
VM_QUOTA_USED=$(echo "$raw" | cut -f2)
@@ -275,13 +312,21 @@ lookup_vm_quota() {
return 0
}
-# Resolves quota family for a GPU VM using the pattern lookup table.
+# Resolves quota family for a VM size.
+# Prefers the sku_family argument (from az vm list-skus "family" field, which
+# matches az vm list-usage name.value). Falls back to the regex table in
+# GPU_QUOTA_PATTERNS / GPU_QUOTA_FAMILIES for older SKUs with an empty family.
GET_QUOTA_FAMILY_RESULT=""
get_quota_family_for_vm() {
local vm_size="$1"
+ local sku_family="${2:-}"
GET_QUOTA_FAMILY_RESULT=""
+ if [ -n "$sku_family" ]; then
+ GET_QUOTA_FAMILY_RESULT="$sku_family"
+ return 0
+ fi
for ((i=0; i<${#GPU_QUOTA_PATTERNS[@]}; i++)); do
- if echo "$vm_size" | grep -qE "${GPU_QUOTA_PATTERNS[$i]}"; then
+ if [[ "$vm_size" =~ ${GPU_QUOTA_PATTERNS[$i]} ]]; then
GET_QUOTA_FAMILY_RESULT="${GPU_QUOTA_FAMILIES[$i]}"
return 0
fi
@@ -289,6 +334,87 @@ get_quota_family_for_vm() {
return 1
}
+# Fetches full per-family quota map for a region in a single call.
+# Populates the global associative array VM_QUOTA_MAP[family]="avail|limit".
+declare -gA VM_QUOTA_MAP=()
+VM_QUOTA_MAP_COUNT=0
+fetch_vm_quota_map() {
+ local location="$1"
+ VM_QUOTA_MAP=()
+ VM_QUOTA_MAP_COUNT=0
+ local raw
+ raw=$(az vm list-usage --location "$location" \
+ --query "[].{n:name.value, l:limit, u:currentValue}" \
+ -o tsv 2>/dev/null | tr -d '\r' || true)
+ [ -z "$raw" ] && return 1
+ while IFS=$'\t' read -r fam limit used; do
+ [ -z "$fam" ] && continue
+ local avail=$((limit - used))
+ VM_QUOTA_MAP["$fam"]="${avail}|${limit}"
+ VM_QUOTA_MAP_COUNT=$((VM_QUOTA_MAP_COUNT + 1))
+ done <<< "$raw"
+ return 0
+}
+
+# Annotates + filters a VM size array by quota.
+# Input entries: "name|cores|memGB|family" (from get_filtered_vm_sizes).
+# Output entries: "name|cores|memGB|family|avail|limit|hasEnough" where:
+# - avail/limit are empty when family is unknown to the quota map
+# - hasEnough is 1/0 (always 1 when family unknown, to avoid hiding SKUs)
+# SKUs that can't satisfy $max_nodes are dropped, except $default_sku which is
+# kept (annotated) so the user always sees it. If filtering would empty the
+# list, the unfiltered annotated set is returned instead.
+annotate_vm_sizes_with_quota() {
+ local -n _list=$1
+ local default_sku="${2:-}"
+ local max_nodes="${3:-1}"
+ [ "$max_nodes" -lt 1 ] && max_nodes=1
+
+ local -a annotated=() kept=()
+
+ for entry in "${_list[@]}"; do
+ local name="${entry%%|*}"
+ local rest="${entry#*|}"
+ local cores="${rest%%|*}"; rest="${rest#*|}"
+ local mem="${rest%%|*}"; rest="${rest#*|}"
+ local family="${rest}"
+
+ get_quota_family_for_vm "$name" "$family" >/dev/null || true
+ local fam="$GET_QUOTA_FAMILY_RESULT"
+
+ local avail="" limit="" has_enough=1 family_known=0
+ if [ -n "$fam" ] && [ -n "${VM_QUOTA_MAP[$fam]+x}" ]; then
+ family_known=1
+ local qinfo="${VM_QUOTA_MAP[$fam]}"
+ avail="${qinfo%%|*}"
+ limit="${qinfo##*|}"
+ local needed=$((cores * max_nodes))
+ if [ "$limit" -le 0 ] || [ "$avail" -lt "$needed" ]; then
+ has_enough=0
+ fi
+ fi
+
+ local annotated_entry="${name}|${cores}|${mem}|${fam}|${avail}|${limit}|${has_enough}"
+ annotated+=("$annotated_entry")
+
+ # Keep SKUs with unknown quota family (can't verify, don't hide newer
+ # SKUs), SKUs with enough quota, or the configured default.
+ if [ "$family_known" = "0" ] || [ "$has_enough" = "1" ]; then
+ kept+=("$annotated_entry")
+ elif [ "$name" = "$default_sku" ]; then
+ kept+=("$annotated_entry")
+ fi
+ done
+
+ # Fall back to the unfiltered list if filtering emptied it
+ if [ "${#kept[@]}" -eq 0 ]; then
+ _list=("${annotated[@]}")
+ else
+ _list=("${kept[@]}")
+ fi
+ return 0
+}
+
# Checks GPU quota, prints formatted status, sets ASSERT_QUOTA_RESULT.
ASSERT_QUOTA_RESULT=""
assert_vm_quota() {
@@ -343,20 +469,25 @@ resolve_model_quota() {
log_info "Checking quota for $model_type in $location..."
local model_info
+ # Query limit + current as TSV (two tab-separated fields, explicit order) —
+ # avoids fragile awk JSON parsing.
model_info=$(az cognitiveservices usage list --location "$location" \
- --query "[?name.value=='$model_type']" --output json 2>/dev/null | tr '[:upper:]' '[:lower:]')
+ --query "[?name.value=='$model_type'] | [0].{l:limit, u:currentValue}" \
+ --output tsv 2>/dev/null | tr -d '\r')
- if [ -z "$model_info" ] || [ "$model_info" = "[]" ]; then
+ if [ -z "$model_info" ]; then
log_warning "No quota info found for '$model_type' in '$location'. Skipping quota check."
log_info "The model may not be available in this region. Bicep will report a clearer error if so."
return 0
fi
local current_value limit
- current_value=$(echo "$model_info" | awk -F': ' '/"currentvalue"/ {print $2}' | tr -d ',' | tr -d ' ')
- limit=$(echo "$model_info" | awk -F': ' '/"limit"/ {print $2}' | tr -d ',' | tr -d ' ')
- current_value=$(echo "${current_value:-0}" | cut -d'.' -f1)
- limit=$(echo "${limit:-0}" | cut -d'.' -f1)
+ limit=$(echo "$model_info" | cut -f1)
+ current_value=$(echo "$model_info" | cut -f2)
+ current_value=$(echo "${current_value:-0}" | cut -d'.' -f1 | tr -dc '0-9')
+ limit=$(echo "${limit:-0}" | cut -d'.' -f1 | tr -dc '0-9')
+ current_value=${current_value:-0}
+ limit=${limit:-0}
local available=$((limit - current_value))
write_key_value "Model" "$model_type"
@@ -398,7 +529,36 @@ resolve_model_quota() {
# ── Interactive VM Selection Menu ─────────────────────────────────────────
# NOTE: This function uses low-level terminal manipulation for interactive
-# arrow-key menus. Its internal styling is intentionally left as-is.
+# arrow-key menus when a capable terminal is available. When running under
+# azd on Windows (pseudo-terminal without raw-mode support) it falls back to
+# a simple numbered-list menu that only needs basic line input.
+
+# Detect whether the terminal supports the interactive arrow-key menu.
+# On Windows (MSYS / Git Bash / Cygwin) the pseudo-terminal that azd provides
+# passes the stty probe but still breaks on read -rsn1, so we always fall back
+# to the simple numbered-list menu there.
+_TERM_SUPPORTS_RAW=""
+_test_terminal_raw_mode() {
+ if [ -n "$_TERM_SUPPORTS_RAW" ]; then return "$_TERM_SUPPORTS_RAW"; fi
+ _TERM_SUPPORTS_RAW=1 # assume unsupported
+
+ # Windows pseudo-terminals (MSYS, Cygwin) don't reliably support raw reads
+ case "$OSTYPE" in
+ msys*|cygwin*|mingw*) _TERM_SUPPORTS_RAW=1; return 1 ;;
+ esac
+
+ if [ -t 0 ] && [ -t 1 ]; then
+ local _old
+ _old=$(stty -g 2>/dev/null) || { _TERM_SUPPORTS_RAW=1; return 1; }
+ if stty raw -echo min 0 time 1 2>/dev/null; then
+ stty "$_old" 2>/dev/null
+ _TERM_SUPPORTS_RAW=0
+ else
+ stty "$_old" 2>/dev/null || true
+ fi
+ fi
+ return "$_TERM_SUPPORTS_RAW"
+}
SELECTED_VM_SKU=""
SELECTED_VM_CORES=0
@@ -407,16 +567,14 @@ SELECTED_VM_FAMILY=""
show_vm_selection_menu() {
local pool_name="$1"
local env_var_name="$2"
- local -n _vm_array=$3
+ local _vm_array_name=$3
local default_sku="$4"
local location="$5"
local max_nodes="${6:-1}"
local is_gpu="${7:-}"
- local vm_count=${#_vm_array[@]}
- local item_count=$((vm_count + 1))
- local max_visible=20
- [ "$item_count" -lt "$max_visible" ] && max_visible=$item_count
+ local -n _vm_array_ref=$_vm_array_name
+ local vm_count=${#_vm_array_ref[@]}
if [ "$vm_count" -eq 0 ]; then
log_error "No VM sizes available for this pool."
@@ -424,13 +582,172 @@ show_vm_selection_menu() {
exit 1
fi
- # Parse into parallel arrays for fast indexed access
- local -a vm_names vm_cores vm_mem
- for entry in "${_vm_array[@]}"; do
- vm_names+=("${entry%%|*}")
- local rest="${entry#*|}"
- vm_cores+=("${rest%%|*}")
- vm_mem+=("${rest#*|}")
+ if _test_terminal_raw_mode; then
+ _show_vm_menu_interactive "$pool_name" "$env_var_name" "$_vm_array_name" "$default_sku" "$location" "$max_nodes" "$is_gpu"
+ else
+ _show_vm_menu_simple "$pool_name" "$env_var_name" "$_vm_array_name" "$default_sku" "$location" "$max_nodes" "$is_gpu"
+ fi
+}
+
+# ── Simple numbered-list fallback (works without raw terminal) ────────────
+_show_vm_menu_simple() {
+ local pool_name="$1"
+ local env_var_name="$2"
+ local -n _svm_array=$3
+ local default_sku="$4"
+ local location="$5"
+ local max_nodes="${6:-1}"
+ local is_gpu="${7:-}"
+
+ local vm_count=${#_svm_array[@]}
+
+ # Parse into parallel arrays
+ local -a vm_names vm_cores vm_mem vm_family vm_avail vm_limit vm_has_enough
+ local has_quota_info=0
+ for entry in "${_svm_array[@]}"; do
+ IFS='|' read -r -a _f <<< "$entry"
+ vm_names+=("${_f[0]}")
+ vm_cores+=("${_f[1]}")
+ vm_mem+=("${_f[2]}")
+ vm_family+=("${_f[3]:-}")
+ vm_avail+=("${_f[4]:-}")
+ vm_limit+=("${_f[5]:-}")
+ vm_has_enough+=("${_f[6]:-1}")
+ if [ -n "${_f[4]:-}" ] || [ -n "${_f[5]:-}" ]; then
+ has_quota_info=1
+ fi
+ done
+
+ # Find default index
+ local default_index=0
+    for ((i=0; i<vm_count; i++)); do
+        [ "${vm_names[$i]}" = "$default_sku" ] && default_index=$i
+    done
+
+    echo ""
+    write_section "Select VM size for ${pool_name} (${vm_count} sizes available)"
+    if [ "$has_quota_info" = "1" ]; then
+        log_info "Quota column shows cores free in this region (pool max nodes: ${max_nodes})."
+    fi
+    echo ""
+
+    while true; do
+        # Print the numbered list
+        local i
+        for ((i=0; i<vm_count; i++)); do
+            local tag=""
+            [ "${vm_names[$i]}" = "$default_sku" ] && tag=" (default)"
+            local qcol=""
+            if [ "$has_quota_info" = "1" ]; then
+                if [ -n "${vm_avail[$i]}" ]; then
+                    qcol=$(printf "%-14s" "${vm_avail[$i]} free")
+                else
+                    qcol=$(printf "%-14s" "quota n/a")
+                fi
+            fi
+            printf "  %2d) %-35s %-10s %-8s %s%s\n" "$((i + 1))" \
+                "${vm_names[$i]}" "${vm_cores[$i]} vCPUs" "${vm_mem[$i]} GB" "$qcol" "$tag"
+        done
+        echo ""
+
+        local choice=""
+        read -r -p "Select a VM size [$((default_index + 1))]: " choice || true
+        [ -z "$choice" ] && choice=$((default_index + 1))
+
+        local selected_sku="" selected_cores=0 selected_family=""
+        if [[ "$choice" =~ ^[0-9]+$ ]] && [ "$choice" -ge 1 ] && [ "$choice" -le "$vm_count" ]; then
+            local idx=$((choice - 1))
+            selected_sku="${vm_names[$idx]}"
+            selected_cores="${vm_cores[$idx]}"
+            selected_family="${vm_family[$idx]}"
+        else
+            log_warning "Invalid selection '$choice' — enter a number between 1 and ${vm_count}."
+            continue
+        fi
+
+        # GPU quota check
+        if [ "$is_gpu" = "gpu" ]; then
+            if get_quota_family_for_vm "$selected_sku" "$selected_family"; then
+                selected_family="$GET_QUOTA_FAMILY_RESULT"
+                local total_cores_needed=$((selected_cores * max_nodes))
+                assert_vm_quota "$pool_name" "$selected_family" "$total_cores_needed" || true
+                if [ "$ASSERT_QUOTA_RESULT" = "zero" ] || [ "$ASSERT_QUOTA_RESULT" = "low" ]; then
+                    log_warning "Re-showing menu..."
+                    continue
+                fi
+            fi
+        fi
+
+        azd env set "$env_var_name" "$selected_sku" 2>/dev/null || \
+        log_warning "Could not persist ${env_var_name} via 'azd env set'."
+
+ log_success "Selected: $selected_sku"
+ echo ""
+ SELECTED_VM_SKU="$selected_sku"
+ SELECTED_VM_CORES="$selected_cores"
+ SELECTED_VM_FAMILY="$selected_family"
+ return 0
+ done
+}
+
+# ── Interactive arrow-key menu (requires raw terminal support) ────────────
+_show_vm_menu_interactive() {
+ local pool_name="$1"
+ local env_var_name="$2"
+ local -n _ivm_array=$3
+ local default_sku="$4"
+ local location="$5"
+ local max_nodes="${6:-1}"
+ local is_gpu="${7:-}"
+
+ local vm_count=${#_ivm_array[@]}
+ local item_count=$((vm_count + 1))
+ local max_visible=20
+ [ "$item_count" -lt "$max_visible" ] && max_visible=$item_count
+
+ # Parse into parallel arrays for fast indexed access.
+ # Entries are either "name|cores|memGB" (legacy), "name|cores|memGB|family",
+ # or quota-annotated "name|cores|memGB|family|avail|limit|hasEnough".
+ local -a vm_names vm_cores vm_mem vm_family vm_avail vm_limit vm_has_enough
+ local has_quota_info=0
+ for entry in "${_ivm_array[@]}"; do
+ IFS='|' read -r -a _f <<< "$entry"
+ vm_names+=("${_f[0]}")
+ vm_cores+=("${_f[1]}")
+ vm_mem+=("${_f[2]}")
+ vm_family+=("${_f[3]:-}")
+ vm_avail+=("${_f[4]:-}")
+ vm_limit+=("${_f[5]:-}")
+ vm_has_enough+=("${_f[6]:-1}")
+ if [ -n "${_f[4]:-}" ] || [ -n "${_f[5]:-}" ]; then
+ has_quota_info=1
+ fi
done
# Find default index, set initial scroll
@@ -465,7 +782,18 @@ show_vm_selection_menu() {
[ "${vm_names[$idx]}" = "$default_sku" ] && is_def="1"
local text tag=""
[ "$is_def" = "1" ] && tag=" (default)"
- text=$(printf "%-35s %-10s %-8s%s" "${vm_names[$idx]}" "${vm_cores[$idx]} vCPUs" "${vm_mem[$idx]} GB" "$tag")
+
+ # Quota column
+ local qcol=""
+ if [ "$has_quota_info" = "1" ]; then
+ if [ -n "${vm_avail[$idx]}" ]; then
+ qcol=$(printf "%-14s" "${vm_avail[$idx]} free")
+ else
+ qcol=$(printf "%-14s" "quota n/a")
+ fi
+ fi
+
+ text=$(printf "%-35s %-10s %-8s %s%s" "${vm_names[$idx]}" "${vm_cores[$idx]} vCPUs" "${vm_mem[$idx]} GB" "$qcol" "$tag")
if [ "$idx" -eq "$selected" ]; then
printf "\033[K \033[30;46m > %s \033[0m\n" "$text"
elif [ "$is_def" = "1" ]; then
@@ -483,23 +811,32 @@ show_vm_selection_menu() {
echo ""
write_section "Select VM size for ${pool_name} (${vm_count} sizes available)"
log_info "Use ↑/↓ to move, Enter to select, C custom, Esc cancel"
+ if [ "$has_quota_info" = "1" ]; then
+ log_info "Quota column shows cores free in this region (pool max nodes: ${max_nodes})."
+ fi
echo ""
- # Get cursor row via ANSI DSR
- local cursor_row
- if [ -t 0 ]; then
- local old_stty; old_stty=$(stty -g)
- stty raw -echo min 0
- printf "\033[6n" > /dev/tty
- local response=""
- while true; do
- local ch; ch=$(dd bs=1 count=1 2>/dev/null)
- response="${response}${ch}"
- case "$response" in *R) break ;; esac
- done
- stty "$old_stty"
- cursor_row=$(echo "$response" | sed 's/.*\[//;s/;.*//')
- else
+ # Get cursor row via ANSI DSR.
+ # The stty/dd sequence can fail on Windows pseudo-terminals or when azd
+ # pipes stdin, so guard against set -e by using a subshell with || true.
+ local cursor_row=""
+ if [ -t 0 ] && [ -t 1 ]; then
+ cursor_row=$(
+ old_stty=$(stty -g 2>/dev/null) || true
+ stty raw -echo min 0 2>/dev/null || true
+ printf "\033[6n" > /dev/tty 2>/dev/null || true
+ resp=""
+ for _i in $(seq 1 20); do
+ ch=$(dd bs=1 count=1 2>/dev/null) || break
+ resp="${resp}${ch}"
+ case "$resp" in *R) break ;; esac
+ done
+ [ -n "$old_stty" ] && stty "$old_stty" 2>/dev/null || true
+ echo "$resp" | sed 's/.*\[//;s/;.*//'
+ ) 2>/dev/null || true
+ fi
+ # Fall back if DSR failed or returned garbage
+ if ! [[ "$cursor_row" =~ ^[0-9]+$ ]]; then
cursor_row=10
fi
@@ -557,15 +894,23 @@ show_vm_selection_menu() {
# GPU quota check
if [ "$is_gpu" = "gpu" ]; then
_write_log_message "Resolving quota family..." "$SYM_INFO" "$C_ACCENT" "$C_TEXT" false true
- get_quota_family_for_vm "$selected_sku" "$location"
+ local sku_family=""
+ for ((i=0; i$null | ConvertFrom-Json -ErrorAction SilentlyContinue
+ $installed = Invoke-AzJson { az extension show --name $ext }
if (-not $installed) {
Log-Info "Installing Azure CLI extension: $ext..."
az extension add --name $ext --yes 2>$null
@@ -49,9 +51,9 @@ foreach ($ext in @('connectedk8s', 'k8s-extension')) {
# =====================================================
# Authenticate and set subscription
# =====================================================
-$EXPIRED_TOKEN = (az ad signed-in-user show --query 'id' -o tsv 2>$null)
+$SIGNED_IN_USER_ID = (az ad signed-in-user show --query 'id' -o tsv 2>$null)
-if (-not $EXPIRED_TOKEN) {
+if (-not $SIGNED_IN_USER_ID) {
Log-Warning "No Azure user signed in. Please login."
az login -o none
}
@@ -96,7 +98,10 @@ Log-Success "NVIDIA GPU Operator installed"
# Step 3: Connect AKS to Azure Arc
# =====================================================
$ARC_CLUSTER_NAME = "${ARC_CLUSTER_PREFIX}$env:AZURE_AKS_CLUSTER_NAME"
-# Defensive clamp: Azure Arc cluster names must be <= 63 chars
+# Azure Arc cluster name rules: lowercase alnum + hyphen, no leading/trailing hyphen, <=63 chars
+$ARC_CLUSTER_NAME = $ARC_CLUSTER_NAME.ToLower() -replace '[^a-z0-9-]', '-'
+$ARC_CLUSTER_NAME = $ARC_CLUSTER_NAME -replace '-+', '-'
+$ARC_CLUSTER_NAME = $ARC_CLUSTER_NAME.Trim('-')
if ($ARC_CLUSTER_NAME.Length -gt 63) {
$ARC_CLUSTER_NAME = $ARC_CLUSTER_NAME.Substring(0, 63).TrimEnd('-')
}
@@ -129,7 +134,7 @@ else {
}
# Save the Arc cluster name to azd env
-azd env set AZURE_ARC_CLUSTER_NAME "$ARC_CLUSTER_NAME"
+[void](Invoke-AzdEnvSet -Name 'AZURE_ARC_CLUSTER_NAME' -Value "$ARC_CLUSTER_NAME")
# =====================================================
# Step 4: Create Public IP and construct Endpoint URI
@@ -147,7 +152,16 @@ if ($env:AZURE_DNS_LABEL) {
Log-Info "Reusing existing DNS label: $DNS_LABEL"
}
else {
- $RANDOM_SUFFIX = Get-Random -Minimum 100 -Maximum 1000
+ # Deterministic 5-digit suffix derived from env + location + subscription.
+ # Same inputs always produce same DNS label (idempotent) while collision
+ # space is 10^5 (vs 900 for the prior Get-Random 100..999).
+ $dnsHashInput = "$($env:AZURE_ENV_NAME)|$($env:AZURE_LOCATION)|$($env:AZURE_SUBSCRIPTION_ID)"
+ $sha = [System.Security.Cryptography.SHA256]::Create()
+ $bytes = $sha.ComputeHash([System.Text.Encoding]::UTF8.GetBytes($dnsHashInput))
+ $sha.Dispose()
+ # Take first 4 bytes as uint32, mod 100000 for a 5-digit decimal suffix
+ $uint = [System.BitConverter]::ToUInt32($bytes, 0)
+ $RANDOM_SUFFIX = ('{0:D5}' -f ($uint % 100000))
$DNS_LABEL = "$($env:AZURE_ENV_NAME)$RANDOM_SUFFIX"
Log-Info "Generated DNS label: $DNS_LABEL"
}
@@ -192,9 +206,9 @@ $VIDEO_INDEXER_ENDPOINT_URI = "https://${DNS_LABEL}.$($env:AZURE_LOCATION).cloud
Write-KeyValue "Endpoint URI" $VIDEO_INDEXER_ENDPOINT_URI
# Persist to azd env
-azd env set AZURE_DNS_LABEL "$DNS_LABEL"
-azd env set AZURE_STATIC_IP "$STATIC_IP"
-azd env set AZURE_VIDEO_INDEXER_ENDPOINT_URI "$VIDEO_INDEXER_ENDPOINT_URI"
+[void](Invoke-AzdEnvSet -Name 'AZURE_DNS_LABEL' -Value "$DNS_LABEL")
+[void](Invoke-AzdEnvSet -Name 'AZURE_STATIC_IP' -Value "$STATIC_IP")
+[void](Invoke-AzdEnvSet -Name 'AZURE_VIDEO_INDEXER_ENDPOINT_URI' -Value "$VIDEO_INDEXER_ENDPOINT_URI")
# =====================================================
# Step 5: Enable App Routing (HTTP only)
@@ -294,7 +308,9 @@ Log-Success "Cert Manager extension deployed"
# =====================================================
Log-Step -Number 9 -Total $totalSteps -Title "Deploying Video Indexer Arc Extension"
-$inferenceAgentEnabled = if ($env:CREATE_FOUNDRY_PROJECT -eq 'true') { 'false' } else { 'true' }
+# Normalize to true by default; only explicit "false" disables Foundry.
+$createFoundry = if ($env:CREATE_FOUNDRY_PROJECT -eq 'false') { 'false' } else { 'true' }
+$inferenceAgentEnabled = if ($createFoundry -eq 'false') { 'true' } else { 'false' }
$mediaStreamerEnabled = if ($env:MEDIA_STREAMER_ENABLED -eq 'false') { 'false' } else { 'true' }
Log-Info "Deploying VI Arc extension (this may take several minutes)..."
@@ -308,8 +324,8 @@ az deployment group create `
videoIndexerEndpointUri="$VIDEO_INDEXER_ENDPOINT_URI" `
deepstreamNodeSelectorValue="$env:AZURE_DEEPSTREAM_NODE_SELECTOR_VALUE" `
inferenceNodeSelectorValue="$env:AZURE_INFERENCE_NODE_SELECTOR_VALUE" `
- inferenceAgentEnabled=$inferenceAgentEnabled `
- mediaStreamerEnabled=$mediaStreamerEnabled
+ inferenceAgentEnabled="$inferenceAgentEnabled" `
+ mediaStreamerEnabled="$mediaStreamerEnabled"
Log-Success "Video Indexer Arc extension deployed"
diff --git a/hooks/postprovision.sh b/hooks/postprovision.sh
index 2f01ab6..e7c0be8 100755
--- a/hooks/postprovision.sh
+++ b/hooks/postprovision.sh
@@ -4,8 +4,7 @@
# Post-Provision Script: Connect AKS to Azure Arc and deploy VI extension
# =============================================================================
-set -e
-
+set -eo pipefail
source "$(dirname "$0")/common.sh"
TOTAL_STEPS=12
@@ -13,6 +12,8 @@ TOTAL_STEPS=12
write_foundry_banner "Post-Provision Setup"
if [ "$CREATE_IN_LOCAL" = "false" ]; then
+ # CREATE_IN_LOCAL=false is set by CI workflows that provision infra via
+ # dedicated pipelines and do not want the local azd hook to run.
log_info "Skipping postprovision script for non-local deployment."
exit 0
fi
@@ -43,9 +44,9 @@ done
# =====================================================
# Authenticate and set subscription
# =====================================================
-EXPIRED_TOKEN=$(az ad signed-in-user show --query 'id' -o tsv 2>/dev/null || true)
+SIGNED_IN_USER_ID=$(az ad signed-in-user show --query 'id' -o tsv 2>/dev/null | tr -d '\r' || true)
-if [ -z "$EXPIRED_TOKEN" ]; then
+if [ -z "$SIGNED_IN_USER_ID" ]; then
log_warning "No Azure user signed in. Please login."
az login -o none
fi
@@ -90,9 +91,12 @@ log_success "NVIDIA GPU Operator installed"
# Step 3: Connect AKS to Azure Arc
# =====================================================
ARC_CLUSTER_NAME="${ARC_CLUSTER_PREFIX}${AZURE_AKS_CLUSTER_NAME}"
-# Defensive clamp: Azure Arc cluster names must be <= 63 chars
+# Azure Arc cluster name rules: lowercase alnum + hyphen, no leading/trailing hyphen, <=63 chars
+ARC_CLUSTER_NAME=$(echo "$ARC_CLUSTER_NAME" | tr '[:upper:]' '[:lower:]' | tr -c 'a-z0-9-' '-')
+ARC_CLUSTER_NAME=$(echo "$ARC_CLUSTER_NAME" | sed -E 's/-+/-/g; s/^-+//; s/-+$//')
if [ ${#ARC_CLUSTER_NAME} -gt 63 ]; then
- ARC_CLUSTER_NAME=$(echo "${ARC_CLUSTER_NAME:0:63}" | sed 's/-$//')
+ ARC_CLUSTER_NAME="${ARC_CLUSTER_NAME:0:63}"
+ ARC_CLUSTER_NAME=$(echo "$ARC_CLUSTER_NAME" | sed -E 's/-+$//')
fi
log_step 3 $TOTAL_STEPS "Connecting AKS to Azure Arc"
@@ -102,7 +106,7 @@ write_key_value "Arc cluster name" "$ARC_CLUSTER_NAME"
ARC_EXISTS=$(az connectedk8s show \
--name "$ARC_CLUSTER_NAME" \
--resource-group "$AZURE_RESOURCE_GROUP" \
- --query "name" -o tsv 2>/dev/null || true)
+ --query "name" -o tsv 2>/dev/null | tr -d '\r' || true)
if [ -n "$ARC_EXISTS" ]; then
log_success "Arc-connected cluster already exists. Skipping."
@@ -116,7 +120,7 @@ else
fi
# Save the Arc cluster name to azd env
-azd env set AZURE_ARC_CLUSTER_NAME "$ARC_CLUSTER_NAME"
+azd_env_set AZURE_ARC_CLUSTER_NAME "$ARC_CLUSTER_NAME" || true
# =====================================================
# Step 4: Create Public IP and construct Endpoint URI
@@ -133,7 +137,18 @@ if [ -n "$AZURE_DNS_LABEL" ]; then
DNS_LABEL="$AZURE_DNS_LABEL"
log_info "Reusing existing DNS label: $DNS_LABEL"
else
- RANDOM_SUFFIX=$((RANDOM % 900 + 100))
+ # Deterministic 5-digit suffix derived from env + location + subscription.
+ # Same inputs always produce same DNS label (idempotent) while collision
+ # space is 10^5 (vs 900 for the prior RANDOM % 900 + 100).
+ DNS_HASH_INPUT="${AZURE_ENV_NAME}|${AZURE_LOCATION}|${AZURE_SUBSCRIPTION_ID:-}"
+ if command -v sha256sum >/dev/null 2>&1; then
+ DNS_HASH=$(printf '%s' "$DNS_HASH_INPUT" | sha256sum | cut -c1-10)
+ else
+ DNS_HASH=$(printf '%s' "$DNS_HASH_INPUT" | shasum -a 256 | cut -c1-10)
+ fi
+ # Convert the hex slice to decimal, then keep up to the first 5 digits
+ RANDOM_SUFFIX=$(printf '%d' "0x${DNS_HASH}" 2>/dev/null | tr -dc '0-9' | head -c 5)
+ RANDOM_SUFFIX=${RANDOM_SUFFIX:-00000}
DNS_LABEL="${AZURE_ENV_NAME}${RANDOM_SUFFIX}"
log_info "Generated DNS label: $DNS_LABEL"
fi
@@ -144,7 +159,7 @@ PUBLIC_IP_NAME="${AZURE_ENV_NAME}-inbound-ip"
PUBLIC_IP_EXISTS=$(az network public-ip show \
--resource-group "$AKS_MC_RG" \
--name "$PUBLIC_IP_NAME" \
- --query "name" -o tsv 2>/dev/null || true)
+ --query "name" -o tsv 2>/dev/null | tr -d '\r' || true)
if [ -n "$PUBLIC_IP_EXISTS" ]; then
log_success "Public IP '$PUBLIC_IP_NAME' already exists. Skipping."
@@ -163,7 +178,7 @@ fi
STATIC_IP=$(az network public-ip show \
--resource-group "$AKS_MC_RG" \
--name "$PUBLIC_IP_NAME" \
- --query "ipAddress" -o tsv)
+ --query "ipAddress" -o tsv | tr -d '\r')
write_key_value "Static IP" "$STATIC_IP"
@@ -171,9 +186,9 @@ VIDEO_INDEXER_ENDPOINT_URI="https://${DNS_LABEL}.${AZURE_LOCATION}.cloudapp.azur
write_key_value "Endpoint URI" "$VIDEO_INDEXER_ENDPOINT_URI"
# Persist to azd env
-azd env set AZURE_DNS_LABEL "${DNS_LABEL}"
-azd env set AZURE_STATIC_IP "${STATIC_IP}"
-azd env set AZURE_VIDEO_INDEXER_ENDPOINT_URI "${VIDEO_INDEXER_ENDPOINT_URI}"
+azd_env_set AZURE_DNS_LABEL "${DNS_LABEL}" || true
+azd_env_set AZURE_STATIC_IP "${STATIC_IP}" || true
+azd_env_set AZURE_VIDEO_INDEXER_ENDPOINT_URI "${VIDEO_INDEXER_ENDPOINT_URI}" || true
# =====================================================
# Step 5: Enable App Routing (HTTP only)
@@ -183,7 +198,7 @@ log_step 5 $TOTAL_STEPS "Enabling App Routing on AKS Cluster"
APPROUTING_ENABLED=$(az aks show \
--resource-group "$AZURE_RESOURCE_GROUP" \
--name "$AZURE_AKS_CLUSTER_NAME" \
- --query "ingressProfile.webAppRouting.enabled" -o tsv 2>/dev/null || true)
+ --query "ingressProfile.webAppRouting.enabled" -o tsv 2>/dev/null | tr -d '\r' || true)
if [ "$APPROUTING_ENABLED" = "true" ]; then
log_success "App Routing already enabled. Skipping."
@@ -258,10 +273,12 @@ log_success "Cert Manager extension deployed"
# =====================================================
log_step 9 $TOTAL_STEPS "Deploying Video Indexer Arc Extension"
-if [ "$CREATE_FOUNDRY_PROJECT" = "true" ]; then
- INFERENCE_AGENT_ENABLED="false"
-else
+# Normalize to true by default; only explicit "false" disables Foundry.
+CREATE_FOUNDRY_PROJECT="${CREATE_FOUNDRY_PROJECT:-true}"
+if [ "$CREATE_FOUNDRY_PROJECT" = "false" ]; then
INFERENCE_AGENT_ENABLED="true"
+else
+ INFERENCE_AGENT_ENABLED="false"
fi
MEDIA_STREAMER_ENABLED="${MEDIA_STREAMER_ENABLED:-true}"
@@ -277,8 +294,8 @@ az deployment group create \
videoIndexerEndpointUri="$VIDEO_INDEXER_ENDPOINT_URI" \
deepstreamNodeSelectorValue="$AZURE_DEEPSTREAM_NODE_SELECTOR_VALUE" \
inferenceNodeSelectorValue="$AZURE_INFERENCE_NODE_SELECTOR_VALUE" \
- inferenceAgentEnabled=$INFERENCE_AGENT_ENABLED \
- mediaStreamerEnabled=$MEDIA_STREAMER_ENABLED
+ inferenceAgentEnabled="$INFERENCE_AGENT_ENABLED" \
+ mediaStreamerEnabled="$MEDIA_STREAMER_ENABLED"
log_success "Video Indexer Arc extension deployed"
log_info "Assigning permissions to Arc extension managed identity..."
@@ -287,7 +304,7 @@ PRINCIPAL_ID=$(az k8s-extension show \
--cluster-name "$ARC_CLUSTER_NAME" \
--cluster-type connectedClusters \
--name videoindexer \
- --query "identity.principalId" -o tsv 2>/dev/null || true)
+ --query "identity.principalId" -o tsv 2>/dev/null | tr -d '\r' || true)
ACCOUNT_RESOURCE_ID="$AZURE_VIDEO_INDEXER_ACCOUNT_RESOURCE_ID"
@@ -305,7 +322,7 @@ else
--assignee "$PRINCIPAL_ID" \
--role Contributor \
--scope "$ACCOUNT_RESOURCE_ID" \
- --query "[0].id" -o tsv 2>/dev/null || true)
+ --query "[0].id" -o tsv 2>/dev/null | tr -d '\r' || true)
if [ -n "$EXISTING_ASSIGNMENT" ]; then
log_success "Role assignment already exists. Skipping."
@@ -332,7 +349,7 @@ VI_EXT_STATE=$(az k8s-extension show \
--cluster-name "$ARC_CLUSTER_NAME" \
--cluster-type connectedClusters \
--name videoindexer \
- --query "provisioningState" -o tsv 2>/dev/null || echo "Unknown")
+ --query "provisioningState" -o tsv 2>/dev/null | tr -d '\r' || echo "Unknown")
# =====================================================
# Acquire VI Extension Access Token (used by Steps 10-11)
@@ -351,7 +368,7 @@ else
--cluster-name "$ARC_CLUSTER_NAME" \
--cluster-type connectedClusters \
--name videoindexer \
- --query "id" -o tsv 2>/dev/null || true)
+ --query "id" -o tsv 2>/dev/null | tr -d '\r' || true)
if [ -z "$EXTENSION_ID" ]; then
log_warning "Failed to retrieve VI extension ID. Skipping camera and agent job setup."
@@ -362,7 +379,7 @@ else
VI_ACCESS_TOKEN=$(az rest --method post --url "$TOKEN_URL" \
--body "$TOKEN_BODY" \
- --query "accessToken" -o tsv 2>/dev/null || true)
+ --query "accessToken" -o tsv 2>/dev/null | tr -d '\r' || true)
if [ -z "$VI_ACCESS_TOKEN" ]; then
log_warning "Failed to generate VI extension access token. Skipping camera and agent job setup."
@@ -459,7 +476,7 @@ log_step 12 $TOTAL_STEPS "Running Post-Deployment Health Checks"
ARC_STATUS=$(az connectedk8s show \
--name "$ARC_CLUSTER_NAME" \
--resource-group "$AZURE_RESOURCE_GROUP" \
- --query "connectivityStatus" -o tsv 2>/dev/null || echo "unknown")
+ --query "connectivityStatus" -o tsv 2>/dev/null | tr -d '\r' || echo "unknown")
ARC_HEALTH="Pass"; [ "$ARC_STATUS" != "Connected" ] && ARC_HEALTH="Warn"
write_health_row "Arc connection" "$ARC_HEALTH" "$ARC_STATUS"
diff --git a/hooks/postup.ps1 b/hooks/postup.ps1
index d8f9335..e99c241 100644
--- a/hooks/postup.ps1
+++ b/hooks/postup.ps1
@@ -64,7 +64,7 @@ if ($HasClusterAccess) {
$HealthResults += @{ Name = "Cluster nodes"; Status = $nodeStatus; Detail = $nodeDetail }
try {
- $gpuNodes = (kubectl --context $KubeContext get nodes -l "accelerator=nvidia" --no-headers 2>$null | Measure-Object -Line).Lines
+ $gpuNodes = (kubectl --context $KubeContext get nodes -l "nvidia.com/gpu.present=true" --no-headers 2>$null | Measure-Object -Line).Lines
if ($gpuNodes -gt 0) {
$gpuNodeStatus = "Pass"
$gpuNodeDetail = "$gpuNodes detected"
diff --git a/hooks/postup.sh b/hooks/postup.sh
index ed9e758..310295e 100755
--- a/hooks/postup.sh
+++ b/hooks/postup.sh
@@ -4,8 +4,7 @@
# Post-Up Script: Deployment health dashboard and next steps
# =============================================================================
-set -e
-
+set -eo pipefail
source "$(dirname "$0")/common.sh"
TOTAL_STEPS=6
@@ -51,18 +50,21 @@ log_step 1 $TOTAL_STEPS "AKS Cluster Health"
if [ "$HAS_CLUSTER_ACCESS" = "true" ]; then
AKS_STATE=$(az aks show -g "$AZURE_RESOURCE_GROUP" -n "$AZURE_AKS_CLUSTER_NAME" \
- --query "provisioningState" -o tsv 2>/dev/null || echo "Unknown")
+ --query "provisioningState" -o tsv 2>/dev/null | tr -d '\r' || echo "Unknown")
AKS_STATUS="Pass"; [ "$AKS_STATE" != "Succeeded" ] && AKS_STATUS="Fail"
write_health_row "AKS provisioning state" "$AKS_STATUS" "$AKS_STATE"
_track_health "$AKS_STATUS"
TOTAL_NODES=$(kubectl --context "$KUBE_CONTEXT" get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ')
- READY_NODES=$(kubectl --context "$KUBE_CONTEXT" get nodes --no-headers 2>/dev/null | grep -c " Ready " || echo "0")
+ READY_NODES=$(kubectl --context "$KUBE_CONTEXT" get nodes --no-headers 2>/dev/null | grep -c " Ready " ; true)
+ READY_NODES=$(echo "$READY_NODES" | tr -dc '0-9' | head -c 6)
+ READY_NODES=${READY_NODES:-0}
NODE_STATUS="Pass"; [ "$READY_NODES" != "$TOTAL_NODES" ] && NODE_STATUS="Warn"
write_health_row "Cluster nodes" "$NODE_STATUS" "${READY_NODES}/${TOTAL_NODES} Ready"
_track_health "$NODE_STATUS"
- GPU_NODES=$(kubectl --context "$KUBE_CONTEXT" get nodes -l "accelerator=nvidia" --no-headers 2>/dev/null | wc -l | tr -d ' ' || echo "0")
+ GPU_NODES=$(kubectl --context "$KUBE_CONTEXT" get nodes -l "nvidia.com/gpu.present=true" --no-headers 2>/dev/null | wc -l | tr -d ' ')
+ GPU_NODES=${GPU_NODES:-0}
GPU_NODE_STATUS="Pass"; [ "$GPU_NODES" -eq 0 ] && GPU_NODE_STATUS="Warn"
GPU_NODE_DETAIL="$GPU_NODES detected"
[ "$GPU_NODES" -eq 0 ] && GPU_NODE_DETAIL="none detected (may still be provisioning)"
@@ -107,7 +109,7 @@ if [ -n "$ARC_CLUSTER_NAME" ]; then
ARC_STATUS=$(az connectedk8s show \
--name "$ARC_CLUSTER_NAME" \
--resource-group "$AZURE_RESOURCE_GROUP" \
- --query "connectivityStatus" -o tsv 2>/dev/null || echo "Unknown")
+ --query "connectivityStatus" -o tsv 2>/dev/null | tr -d '\r' || echo "Unknown")
ARC_HEALTH="Pass"; [ "$ARC_STATUS" != "Connected" ] && ARC_HEALTH="Warn"
write_health_row "Arc connection" "$ARC_HEALTH" "$ARC_CLUSTER_NAME ($ARC_STATUS)"
_track_health "$ARC_HEALTH"
@@ -144,8 +146,15 @@ if [ "$HAS_CLUSTER_ACCESS" = "true" ]; then
if [ -n "${AZURE_DNS_LABEL:-}" ] && [ -n "${AZURE_LOCATION:-}" ]; then
FQDN="${AZURE_DNS_LABEL}.${AZURE_LOCATION}.cloudapp.azure.com"
- DNS_RESOLVED=$(host "$FQDN" 2>/dev/null | grep -c "has address" || \
- nslookup "$FQDN" 2>/dev/null | grep -c "Address:" || echo "0")
+ if command -v host >/dev/null 2>&1; then
+ DNS_RESOLVED=$(host "$FQDN" 2>/dev/null | grep -c "has address" ; true)
+ elif command -v nslookup >/dev/null 2>&1; then
+ DNS_RESOLVED=$(nslookup "$FQDN" 2>/dev/null | grep -c "^Address:" ; true)
+ else
+ DNS_RESOLVED=0
+ fi
+ DNS_RESOLVED=$(echo "${DNS_RESOLVED:-0}" | tr -dc '0-9' | head -c 6)
+ DNS_RESOLVED=${DNS_RESOLVED:-0}
if [ "$DNS_RESOLVED" -gt 0 ]; then
write_health_row "DNS resolution" "Pass" "$FQDN"
_track_health "Pass"
@@ -169,7 +178,7 @@ if [ -n "$ARC_CLUSTER_NAME" ]; then
--cluster-name "$ARC_CLUSTER_NAME" \
--cluster-type connectedClusters \
--name videoindexer \
- --query "provisioningState" -o tsv 2>/dev/null || echo "Unknown")
+ --query "provisioningState" -o tsv 2>/dev/null | tr -d '\r' || echo "Unknown")
VI_EXT_STATUS="Pass"; [ "$VI_STATE" != "Succeeded" ] && VI_EXT_STATUS="Warn"
write_health_row "VI Extension" "$VI_EXT_STATUS" "$VI_STATE"
_track_health "$VI_EXT_STATUS"
diff --git a/hooks/predown.ps1 b/hooks/predown.ps1
deleted file mode 100644
index 40e6400..0000000
--- a/hooks/predown.ps1
+++ /dev/null
@@ -1,77 +0,0 @@
-# =============================================================================
-# Pre-Down Cleanup: Remove resources created outside azd's tracked scope
-# =============================================================================
-# The postprovision hook creates a Public IP in the AKS node resource group
-# (MC_ group), which azd down does not track. This hook cleans it up before
-# the resource group is deleted.
-# =============================================================================
-
-$ErrorActionPreference = "Stop"
-
-Write-Host ""
-Write-Host " Pre-down cleanup: removing untracked resources..." -ForegroundColor Cyan
-Write-Host ""
-
-# ── Public IP in the AKS node resource group ────────────────────────────────
-$AKS_MC_RG = $env:AZURE_AKS_NODE_RESOURCE_GROUP
-$PUBLIC_IP_NAME = "$($env:AZURE_ENV_NAME)-inbound-ip"
-
-if (-not $AKS_MC_RG -or -not $env:AZURE_ENV_NAME) {
- Write-Host " [SKIP] Missing AZURE_AKS_NODE_RESOURCE_GROUP or AZURE_ENV_NAME. Nothing to clean up." -ForegroundColor Yellow
- exit 0
-}
-
-$exists = $null
-try {
- $exists = (az network public-ip show `
- --resource-group "$AKS_MC_RG" `
- --name "$PUBLIC_IP_NAME" `
- --query "name" -o tsv 2>$null)
-}
-catch {
- $exists = $null
-}
-
-if ($exists) {
- Write-Host " Deleting public IP '$PUBLIC_IP_NAME' from '$AKS_MC_RG'..."
- az network public-ip delete `
- --resource-group "$AKS_MC_RG" `
- --name "$PUBLIC_IP_NAME" 2>$null
- Write-Host " [OK] Public IP deleted." -ForegroundColor Green
-}
-else {
- Write-Host " [SKIP] Public IP '$PUBLIC_IP_NAME' not found in '$AKS_MC_RG'. Nothing to clean up." -ForegroundColor Yellow
-}
-
-# ── Arc-connected cluster ────────────────────────────────────────────────────
-$ARC_CLUSTER_NAME = $env:AZURE_ARC_CLUSTER_NAME
-$RESOURCE_GROUP = $env:AZURE_RESOURCE_GROUP
-
-if ($ARC_CLUSTER_NAME -and $RESOURCE_GROUP) {
- $arcExists = $null
- try {
- $arcExists = (az connectedk8s show `
- --name "$ARC_CLUSTER_NAME" `
- --resource-group "$RESOURCE_GROUP" `
- --query "name" -o tsv 2>$null)
- }
- catch {
- $arcExists = $null
- }
-
- if ($arcExists) {
- Write-Host " Disconnecting Arc cluster '$ARC_CLUSTER_NAME'..."
- az connectedk8s delete `
- --name "$ARC_CLUSTER_NAME" `
- --resource-group "$RESOURCE_GROUP" `
- --yes 2>$null
- Write-Host " [OK] Arc cluster disconnected." -ForegroundColor Green
- }
- else {
- Write-Host " [SKIP] Arc cluster '$ARC_CLUSTER_NAME' not found. Nothing to clean up." -ForegroundColor Yellow
- }
-}
-
-Write-Host ""
-Write-Host " Pre-down cleanup complete." -ForegroundColor Green
-Write-Host ""
diff --git a/hooks/predown.sh b/hooks/predown.sh
deleted file mode 100755
index df3e4da..0000000
--- a/hooks/predown.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-
-# =============================================================================
-# Pre-Down Cleanup: Remove resources created outside azd's tracked scope
-# =============================================================================
-# The postprovision hook creates a Public IP in the AKS node resource group
-# (MC_ group), which azd down does not track. This hook cleans it up before
-# the resource group is deleted.
-# =============================================================================
-
-set -e
-
-echo ""
-echo " Pre-down cleanup: removing untracked resources..."
-echo ""
-
-# ── Public IP in the AKS node resource group ────────────────────────────────
-AKS_MC_RG="${AZURE_AKS_NODE_RESOURCE_GROUP}"
-PUBLIC_IP_NAME="${AZURE_ENV_NAME}-inbound-ip"
-
-if [ -z "$AKS_MC_RG" ] || [ -z "$AZURE_ENV_NAME" ]; then
- echo " [SKIP] Missing AZURE_AKS_NODE_RESOURCE_GROUP or AZURE_ENV_NAME. Nothing to clean up."
- exit 0
-fi
-
-IP_EXISTS=$(az network public-ip show \
- --resource-group "$AKS_MC_RG" \
- --name "$PUBLIC_IP_NAME" \
- --query "name" -o tsv 2>/dev/null || true)
-
-if [ -n "$IP_EXISTS" ]; then
- echo " Deleting public IP '$PUBLIC_IP_NAME' from '$AKS_MC_RG'..."
- az network public-ip delete \
- --resource-group "$AKS_MC_RG" \
- --name "$PUBLIC_IP_NAME" 2>/dev/null
- echo " [OK] Public IP deleted."
-else
- echo " [SKIP] Public IP '$PUBLIC_IP_NAME' not found in '$AKS_MC_RG'. Nothing to clean up."
-fi
-
-# ── Arc-connected cluster ────────────────────────────────────────────────────
-ARC_CLUSTER_NAME="${AZURE_ARC_CLUSTER_NAME}"
-RESOURCE_GROUP="${AZURE_RESOURCE_GROUP}"
-
-if [ -n "$ARC_CLUSTER_NAME" ] && [ -n "$RESOURCE_GROUP" ]; then
- ARC_EXISTS=$(az connectedk8s show \
- --name "$ARC_CLUSTER_NAME" \
- --resource-group "$RESOURCE_GROUP" \
- --query "name" -o tsv 2>/dev/null || true)
-
- if [ -n "$ARC_EXISTS" ]; then
- echo " Disconnecting Arc cluster '$ARC_CLUSTER_NAME'..."
- az connectedk8s delete \
- --name "$ARC_CLUSTER_NAME" \
- --resource-group "$RESOURCE_GROUP" \
- --yes 2>/dev/null
- echo " [OK] Arc cluster disconnected."
- else
- echo " [SKIP] Arc cluster '$ARC_CLUSTER_NAME' not found. Nothing to clean up."
- fi
-fi
-
-echo ""
-echo " Pre-down cleanup complete."
-echo ""
diff --git a/hooks/preprovision.ps1 b/hooks/preprovision.ps1
index ec854b7..5c22a2e 100644
--- a/hooks/preprovision.ps1
+++ b/hooks/preprovision.ps1
@@ -41,7 +41,7 @@ if ($env:AZURE_SUBSCRIPTION_ID) {
}
$subName = (az account show --query "name" -o tsv 2>$null)
Log-Success "Subscription: $subName"
- Log-Success "ID" $env:AZURE_SUBSCRIPTION_ID
+ Log-Success "ID: $env:AZURE_SUBSCRIPTION_ID"
}
# =====================================================
@@ -75,14 +75,14 @@ if ($storageSku -eq 'Standard_ZRS') {
Log-Info "Checking ZRS availability in $($env:AZURE_LOCATION)..."
$zrsAvailable = $false
try {
- $skuInfo = (az provider show --namespace Microsoft.Storage `
- --query "resourceTypes[?resourceType=='storageAccounts'].zoneMappings[?contains(location, '$($env:AZURE_LOCATION)')].location | [0][0]" `
+ $skuInfo = (az storage account list-skus --location $env:AZURE_LOCATION `
+ --query "[?name=='Standard_ZRS'].name | [0]" `
-o tsv 2>$null)
- if ($skuInfo) { $zrsAvailable = $true }
+ if (-not [string]::IsNullOrWhiteSpace($skuInfo)) { $zrsAvailable = $true }
}
- catch { }
+ catch { $zrsAvailable = $false }
if (-not $zrsAvailable) {
- Log-Warning "Standard_ZRS may not be available in '$($env:AZURE_LOCATION)'."
+ Log-Warning "Requested Standard_ZRS is not available in '$($env:AZURE_LOCATION)'."
Log-Warning "Falling back to Standard_LRS to avoid deployment failure."
azd env set STORAGE_SKU_NAME Standard_LRS 2>$null
$storageSku = 'Standard_LRS'
@@ -91,7 +91,52 @@ if ($storageSku -eq 'Standard_ZRS') {
Log-Success "ZRS is available in $($env:AZURE_LOCATION)"
}
}
-Write-KeyValue "Storage SKU" $storageSku
+Write-KeyValue "STORAGE_SKU" $storageSku
+
+# ── Validate Kubernetes version against region capabilities ──────────────
+# Pinned minor versions drift from standard support to LTS-only over time (e.g. 1.32 → LTS).
+# If the requested minor is not available on the standard "KubernetesOfficial"
+# support plan in this region, auto-fall back to the region's default minor.
+$requestedK8s = if ($env:KUBERNETES_VERSION) { $env:KUBERNETES_VERSION } else { '1.34' }
+$versions = Invoke-AzJson { az aks get-versions --location $env:AZURE_LOCATION -o json }
+if ($versions -and $versions.values) {
+ $standardVersions = @($versions.values | Where-Object {
+ $_.capabilities.supportPlan -contains 'KubernetesOfficial'
+ } | ForEach-Object { $_.version })
+ $defaultVersion = ($versions.values | Where-Object { $_.isDefault } | Select-Object -First 1).version
+ if ($standardVersions -notcontains $requestedK8s) {
+ if ($defaultVersion) {
+ Log-Warning "Kubernetes '$requestedK8s' is not on standard support in '$($env:AZURE_LOCATION)'."
+ Log-Warning "Falling back to region default: '$defaultVersion'."
+ [void](Invoke-AzdEnvSet -Name 'KUBERNETES_VERSION' -Value $defaultVersion)
+ $env:KUBERNETES_VERSION = $defaultVersion
+ $requestedK8s = $defaultVersion
+ }
+ else {
+ Log-Warning "Kubernetes '$requestedK8s' is not on standard support and no default could be resolved."
+ }
+ }
+ Write-KeyValue "KUBERNETES_VERSION" $requestedK8s
+}
+else {
+ Log-Warning "Could not query AKS versions in '$($env:AZURE_LOCATION)'. Proceeding with '$requestedK8s'."
+}
+
+# Persist the resolved CREATE_FOUNDRY_PROJECT value so Bicep + all downstream
+# hooks agree on the same boolean. (Bicep's default is true; the hooks default
+# to true to match. Without this step, a hook that reads $env:CREATE_FOUNDRY_PROJECT
+# directly would see an empty string and take the 'false' branch.)
+$resolvedFoundry = if ($CREATE_FOUNDRY_PROJECT) { 'true' } else { 'false' }
+if ($env:CREATE_FOUNDRY_PROJECT -ne $resolvedFoundry) {
+ try {
+ azd env set CREATE_FOUNDRY_PROJECT $resolvedFoundry 2>$null
+ $env:CREATE_FOUNDRY_PROJECT = $resolvedFoundry
+ Write-KeyValue "CREATE_FOUNDRY_PROJECT" $resolvedFoundry
+ }
+ catch {
+ Log-Warning "Could not persist CREATE_FOUNDRY_PROJECT via 'azd env set'."
+ }
+}
# =====================================================
# Step 4: Resolve AI Model Quota (if Foundry enabled)
@@ -124,7 +169,34 @@ if ($allVmSizes.Count -eq 0) {
exit 1
}
-Log-Success "Found $($allVmSizes.Count) VM sizes in $($env:AZURE_LOCATION)"
+$cpuCount = @($allVmSizes | Where-Object { $n = $_.Name; ($CPU_VM_PREFIXES | Where-Object { $n.StartsWith($_) }).Count -gt 0 }).Count
+$gpuCount = @($allVmSizes | Where-Object { $n = $_.Name; ($GPU_VM_PREFIXES | Where-Object { $n.StartsWith($_) }).Count -gt 0 }).Count
+Log-Success "Found VM SKUs in $($env:AZURE_LOCATION)"
+Log-Info "$cpuCount CPU + $gpuCount GPU sizes available"
+
+if ($cpuCount -eq 0 -and $gpuCount -eq 0) {
+ Log-Error "No VM sizes are available in '$($env:AZURE_LOCATION)' for this subscription."
+ Log-Info "This usually means the region has restrictive SKU policies for your subscription."
+ Log-Info "Try a different region: run 'azd env set AZURE_LOCATION ' and re-run 'azd up'."
+ Log-Info "Recommended regions: eastus2, westus3, southcentralus, northeurope"
+ exit 1
+}
+
+if ($cpuCount -eq 0) {
+ Log-Error "No CPU VM sizes (D/E/F-series) are available in '$($env:AZURE_LOCATION)'."
+ Log-Info "AKS requires CPU nodes for system and workload pools."
+ Log-Info "Try a different region: run 'azd env set AZURE_LOCATION ' and re-run 'azd up'."
+ Log-Info "Recommended regions: eastus2, westus3, southcentralus, northeurope"
+ exit 1
+}
+
+if ($gpuCount -eq 0) {
+ Log-Warning "No GPU VM sizes (NC/NV/ND-series) are available in '$($env:AZURE_LOCATION)'."
+ Log-Info "GPU pools are required for video processing. Consider a region with GPU support."
+ Log-Info "Try: run 'azd env set AZURE_LOCATION ' and re-run 'azd up'."
+ Log-Info "Recommended GPU regions: eastus2, westus3, southcentralus, northeurope"
+ exit 1
+}
# Determine current/default values (env var overrides config default)
$currentSystemVm = if ($env:SYSTEM_VM_SIZE) { $env:SYSTEM_VM_SIZE } else { $DEFAULT_SYSTEM_VM_SIZE }
@@ -132,35 +204,44 @@ $currentWorkloadVm = if ($env:WORKLOAD_VM_SIZE) { $env:WORKLOAD_VM_SIZE
$currentDeepstreamVm = if ($env:DEEPSTREAM_GPU_VM_SIZE) { $env:DEEPSTREAM_GPU_VM_SIZE } else { $DEFAULT_DEEPSTREAM_GPU_SIZE }
$currentInferenceVm = if ($env:INFERENCE_GPU_VM_SIZE) { $env:INFERENCE_GPU_VM_SIZE } else { $DEFAULT_INFERENCE_GPU_SIZE }
-# Pick recommended VM sizes per pool (4 families x 3 sizes = ~12 items each)
-$systemVmSizes = Select-VmSizesForMenu -AllSizes $allVmSizes -Prefixes $CPU_VM_PREFIXES -RecommendedFamilies $SYSTEM_RECOMMENDED_FAMILIES -DefaultSku $currentSystemVm -SizesPerFamily $SIZES_PER_FAMILY -MaxCores $SYSTEM_MAX_CORES
-$workloadVmSizes = Select-VmSizesForMenu -AllSizes $allVmSizes -Prefixes $CPU_VM_PREFIXES -RecommendedFamilies $WORKLOAD_RECOMMENDED_FAMILIES -DefaultSku $currentWorkloadVm -SizesPerFamily $SIZES_PER_FAMILY -MinCores $WORKLOAD_MIN_CORES
-$gpuVmSizes = Select-VmSizesForMenu -AllSizes $allVmSizes -Prefixes $GPU_VM_PREFIXES -RecommendedFamilies $GPU_RECOMMENDED_FAMILIES -DefaultSku $currentDeepstreamVm -SizesPerFamily $SIZES_PER_FAMILY
+Log-Info "Querying VM quota in region..."
+$quotaData = Get-AzVmQuotaForRegion -Location $env:AZURE_LOCATION
+if ($quotaData -and $quotaData.Keys.Count -gt 0) {
+ Log-Success "Quota data retrieved for $($quotaData.Keys.Count) VM families"
+}
+else {
+ Log-Warning "Could not retrieve VM quota data for '$($env:AZURE_LOCATION)'. Menus will show all SKUs without quota filtering."
+ $quotaData = @{}
+}
-# Resolve defaults — if the configured default isn't available, pick the closest match
-$currentSystemVm = Resolve-DefaultSku -DefaultSku $currentSystemVm -AvailableSizes $systemVmSizes -PreferCores 4
-$currentWorkloadVm = Resolve-DefaultSku -DefaultSku $currentWorkloadVm -AvailableSizes $workloadVmSizes -PreferCores 32
-$currentDeepstreamVm = Resolve-DefaultSku -DefaultSku $currentDeepstreamVm -AvailableSizes $gpuVmSizes -PreferCores 24
-$currentInferenceVm = Resolve-DefaultSku -DefaultSku $currentInferenceVm -AvailableSizes $gpuVmSizes -PreferCores 24
+# Pick recommended VM sizes per pool (4 families x 3 sizes = ~12 items each).
+# QuotaData filters out SKUs whose family can't satisfy the pool's max node count.
+$systemVmSizes = Select-VmSizesForMenu -AllSizes $allVmSizes -Prefixes $CPU_VM_PREFIXES -RecommendedFamilies $SYSTEM_RECOMMENDED_FAMILIES -DefaultSku $currentSystemVm -SizesPerFamily $SIZES_PER_FAMILY -MaxCores $SYSTEM_MAX_CORES -QuotaData $quotaData -MaxNodes $SYSTEM_MAX_NODE_COUNT
+$workloadVmSizes = Select-VmSizesForMenu -AllSizes $allVmSizes -Prefixes $CPU_VM_PREFIXES -RecommendedFamilies $WORKLOAD_RECOMMENDED_FAMILIES -DefaultSku $currentWorkloadVm -SizesPerFamily $SIZES_PER_FAMILY -MinCores $WORKLOAD_MIN_CORES -QuotaData $quotaData -MaxNodes $WORKLOAD_MAX_NODE_COUNT
+$deepstreamVmSizes = Select-VmSizesForMenu -AllSizes $allVmSizes -Prefixes $GPU_VM_PREFIXES -RecommendedFamilies $GPU_RECOMMENDED_FAMILIES -DefaultSku $currentDeepstreamVm -SizesPerFamily $SIZES_PER_FAMILY -QuotaData $quotaData -MaxNodes $DEEPSTREAM_GPU_MAX_NODE_COUNT
+$inferenceVmSizes = Select-VmSizesForMenu -AllSizes $allVmSizes -Prefixes $GPU_VM_PREFIXES -RecommendedFamilies $GPU_RECOMMENDED_FAMILIES -DefaultSku $currentInferenceVm -SizesPerFamily $SIZES_PER_FAMILY -QuotaData $quotaData -MaxNodes $INFERENCE_GPU_MAX_NODE_COUNT
-Write-KeyValue "System pool" "$($systemVmSizes.Count) sizes"
-Write-KeyValue "Workload pool" "$($workloadVmSizes.Count) sizes"
-Write-KeyValue "GPU pool" "$($gpuVmSizes.Count) sizes"
+# Resolve defaults — if the configured default isn't available, pick the closest match
+$currentSystemVm = Resolve-DefaultSku -DefaultSku $currentSystemVm -AvailableSizes $systemVmSizes -PreferCores 4
+$currentWorkloadVm = Resolve-DefaultSku -DefaultSku $currentWorkloadVm -AvailableSizes $workloadVmSizes -PreferCores 32
+$currentDeepstreamVm = Resolve-DefaultSku -DefaultSku $currentDeepstreamVm -AvailableSizes $deepstreamVmSizes -PreferCores 24
+$currentInferenceVm = Resolve-DefaultSku -DefaultSku $currentInferenceVm -AvailableSizes $inferenceVmSizes -PreferCores 24
-Log-Info "Querying GPU quota..."
-$quotaData = Get-AzVmQuotaForRegion -Location $env:AZURE_LOCATION
-Log-Success "GPU quota data retrieved"
+Write-KeyValue "System pool" "$($systemVmSizes.Count) sizes (with quota)"
+Write-KeyValue "Workload pool" "$($workloadVmSizes.Count) sizes (with quota)"
+Write-KeyValue "Deepstream pool" "$($deepstreamVmSizes.Count) sizes (with quota)"
+Write-KeyValue "Inference pool" "$($inferenceVmSizes.Count) sizes (with quota)"
Write-Section "Choose a VM SKU for each AKS node pool"
Log-Info "The default is highlighted. Press Enter to accept, C for custom."
-# CPU pools (separate lists for system vs workload)
-$selectedSystem = Show-VmSelectionMenu -PoolName "System (CPU)" -EnvVarName "SYSTEM_VM_SIZE" -VmSizes $systemVmSizes -DefaultSku $currentSystemVm -Location $env:AZURE_LOCATION
-$selectedWorkload = Show-VmSelectionMenu -PoolName "Workload (CPU)" -EnvVarName "WORKLOAD_VM_SIZE" -VmSizes $workloadVmSizes -DefaultSku $currentWorkloadVm -Location $env:AZURE_LOCATION
+# CPU pools (quota-filtered lists, node count determines total cores checked)
+$selectedSystem = Show-VmSelectionMenu -PoolName "System (CPU)" -EnvVarName "SYSTEM_VM_SIZE" -VmSizes $systemVmSizes -DefaultSku $currentSystemVm -Location $env:AZURE_LOCATION -MaxNodes $SYSTEM_MAX_NODE_COUNT -QuotaData $quotaData
+$selectedWorkload = Show-VmSelectionMenu -PoolName "Workload (CPU)" -EnvVarName "WORKLOAD_VM_SIZE" -VmSizes $workloadVmSizes -DefaultSku $currentWorkloadVm -Location $env:AZURE_LOCATION -MaxNodes $WORKLOAD_MAX_NODE_COUNT -QuotaData $quotaData
# GPU pools (quota validated inline, node count determines total cores checked)
-$selectedDeepstream = Show-VmSelectionMenu -PoolName "Deepstream (GPU)" -EnvVarName "DEEPSTREAM_GPU_VM_SIZE" -VmSizes $gpuVmSizes -DefaultSku $currentDeepstreamVm -Location $env:AZURE_LOCATION -IsGpu -MaxNodes $DEEPSTREAM_GPU_MAX_NODE_COUNT -QuotaData $quotaData
-$selectedInference = Show-VmSelectionMenu -PoolName "Inference (GPU)" -EnvVarName "INFERENCE_GPU_VM_SIZE" -VmSizes $gpuVmSizes -DefaultSku $currentInferenceVm -Location $env:AZURE_LOCATION -IsGpu -MaxNodes $INFERENCE_GPU_MAX_NODE_COUNT -QuotaData $quotaData
+$selectedDeepstream = Show-VmSelectionMenu -PoolName "Deepstream (GPU)" -EnvVarName "DEEPSTREAM_GPU_VM_SIZE" -VmSizes $deepstreamVmSizes -DefaultSku $currentDeepstreamVm -Location $env:AZURE_LOCATION -IsGpu -MaxNodes $DEEPSTREAM_GPU_MAX_NODE_COUNT -QuotaData $quotaData
+$selectedInference = Show-VmSelectionMenu -PoolName "Inference (GPU)" -EnvVarName "INFERENCE_GPU_VM_SIZE" -VmSizes $inferenceVmSizes -DefaultSku $currentInferenceVm -Location $env:AZURE_LOCATION -IsGpu -MaxNodes $INFERENCE_GPU_MAX_NODE_COUNT -QuotaData $quotaData
# Update script-scope variables for downstream consumers
$DEEPSTREAM_GPU_VM_SIZE = $selectedDeepstream.Sku
diff --git a/hooks/preprovision.sh b/hooks/preprovision.sh
index fe4a79d..7571524 100755
--- a/hooks/preprovision.sh
+++ b/hooks/preprovision.sh
@@ -4,8 +4,7 @@
# Pre-Provision Script: Validate prerequisites before provisioning
# =============================================================================
-set -e
-
+set -eo pipefail
source "$(dirname "$0")/common.sh"
TOTAL_STEPS=6
@@ -17,14 +16,14 @@ write_foundry_banner "Pre-Provision Validation"
# =====================================================
log_step 1 $TOTAL_STEPS "Checking Azure CLI Authentication"
-ACCOUNT_INFO=$(az account show --query "{name:name, id:id}" -o tsv 2>/dev/null || true)
+ACCOUNT_INFO=$(az account show --query "{name:name, id:id}" -o tsv 2>/dev/null | tr -d '\r' || true)
if [ -z "$ACCOUNT_INFO" ]; then
log_error "Not logged in to Azure CLI. Run 'az login' before provisioning."
exit 1
fi
-ACCOUNT_NAME=$(az account show --query "name" -o tsv 2>/dev/null)
+ACCOUNT_NAME=$(az account show --query "name" -o tsv 2>/dev/null | tr -d '\r')
log_success "Signed in to account: ${ACCOUNT_NAME}"
if [ -n "${AZURE_SUBSCRIPTION_ID:-}" ]; then
@@ -33,9 +32,9 @@ if [ -n "${AZURE_SUBSCRIPTION_ID:-}" ]; then
log_info "Verify the subscription ID and your access permissions."
exit 1
}
- SUB_NAME=$(az account show --query "name" -o tsv 2>/dev/null)
+ SUB_NAME=$(az account show --query "name" -o tsv 2>/dev/null | tr -d '\r')
log_success "Subscription: ${SUB_NAME}"
- write_key_value "ID" "$AZURE_SUBSCRIPTION_ID"
+ log_success "ID: ${AZURE_SUBSCRIPTION_ID}"
fi
# =====================================================
@@ -51,7 +50,7 @@ assert_cli_tools az helm kubectl -- kubelogin jq
log_step 3 $TOTAL_STEPS "Validating Environment Variables"
for var in AZURE_SUBSCRIPTION_ID AZURE_LOCATION AZURE_ENV_NAME; do
- eval val=\$$var 2>/dev/null || val=""
+ val="${!var:-}"
if [ -z "$val" ]; then
write_health_row "$var" "Fail" "not set"
else
@@ -65,11 +64,11 @@ assert_env_vars AZURE_SUBSCRIPTION_ID AZURE_LOCATION AZURE_ENV_NAME
STORAGE_SKU="${STORAGE_SKU_NAME:-Standard_LRS}"
if [ "$STORAGE_SKU" = "Standard_ZRS" ]; then
log_info "Checking ZRS availability in ${AZURE_LOCATION}..."
- ZRS_AVAILABLE=$(az provider show --namespace Microsoft.Storage \
- --query "resourceTypes[?resourceType=='storageAccounts'].zoneMappings[?contains(location, '${AZURE_LOCATION}')].location | [0][0]" \
- -o tsv 2>/dev/null || true)
+ ZRS_AVAILABLE=$(az storage account list-skus --location "$AZURE_LOCATION" \
+ --query "[?name=='Standard_ZRS'].name | [0]" \
+ -o tsv 2>/dev/null | tr -d '\r' || true)
if [ -z "$ZRS_AVAILABLE" ]; then
- log_warning "Standard_ZRS may not be available in '${AZURE_LOCATION}'."
+ log_warning "Requested Standard_ZRS is not available in '${AZURE_LOCATION}'."
log_warning "Falling back to Standard_LRS to avoid deployment failure."
azd env set STORAGE_SKU_NAME Standard_LRS 2>/dev/null || true
STORAGE_SKU="Standard_LRS"
@@ -77,7 +76,42 @@ if [ "$STORAGE_SKU" = "Standard_ZRS" ]; then
log_success "ZRS is available in ${AZURE_LOCATION}"
fi
fi
-write_key_value "Storage SKU" "$STORAGE_SKU"
+write_key_value "STORAGE_SKU" "$STORAGE_SKU"
+
+# ── Validate Kubernetes version against region capabilities ──────────────
+# Pinned minors drift from standard support to LTS-only (e.g. 1.32 → LTS).
+# If the requested minor is not available on the standard "KubernetesOfficial"
+# support plan in this region, auto-fall back to the region's default minor.
+REQUESTED_K8S="${KUBERNETES_VERSION:-1.34}"
+K8S_VERSIONS_JSON=$(az aks get-versions --location "$AZURE_LOCATION" -o json 2>/dev/null || true)
+if [ -n "$K8S_VERSIONS_JSON" ] && command -v jq >/dev/null 2>&1; then
+ K8S_STANDARD=$(echo "$K8S_VERSIONS_JSON" | jq -r '.values[] | select(.capabilities.supportPlan | contains(["KubernetesOfficial"])) | .version' 2>/dev/null || true)
+ K8S_DEFAULT=$(echo "$K8S_VERSIONS_JSON" | jq -r '.values[] | select(.isDefault==true) | .version' 2>/dev/null | head -n1)
+ if ! echo "$K8S_STANDARD" | grep -qx "$REQUESTED_K8S"; then
+ if [ -n "$K8S_DEFAULT" ]; then
+ log_warning "Kubernetes '${REQUESTED_K8S}' is not on standard support in '${AZURE_LOCATION}'."
+ log_warning "Falling back to region default: '${K8S_DEFAULT}'."
+ azd_env_set KUBERNETES_VERSION "$K8S_DEFAULT" || true
+ export KUBERNETES_VERSION="$K8S_DEFAULT"
+ REQUESTED_K8S="$K8S_DEFAULT"
+ else
+ log_warning "Kubernetes '${REQUESTED_K8S}' is not on standard support and no default could be resolved."
+ fi
+ fi
+ write_key_value "KUBERNETES_VERSION" "$REQUESTED_K8S"
+else
+ log_warning "Could not query AKS versions in '${AZURE_LOCATION}'. Proceeding with '${REQUESTED_K8S}'."
+fi
+
+# Persist the resolved CREATE_FOUNDRY_PROJECT value so Bicep + all downstream
+# hooks agree on the same boolean. (Bicep's default is true; the hooks default
+# to true to match.)
+if azd env set CREATE_FOUNDRY_PROJECT "${CREATE_FOUNDRY_PROJECT:=true}" 2>/dev/null; then
+ export CREATE_FOUNDRY_PROJECT
+ write_key_value "CREATE_FOUNDRY_PROJECT" "$CREATE_FOUNDRY_PROJECT"
+else
+ log_warning "Could not persist CREATE_FOUNDRY_PROJECT via 'azd env set'."
+fi
# =====================================================
# Step 4: Resolve AI Model Quota (if Foundry enabled)
@@ -113,35 +147,74 @@ log_success "Found VM SKUs in ${AZURE_LOCATION}"
log_info "${#ALL_CPU_VMS[@]} CPU + ${#ALL_GPU_VMS[@]} GPU sizes available"
+if [ "${#ALL_CPU_VMS[@]}" -eq 0 ] && [ "${#ALL_GPU_VMS[@]}" -eq 0 ]; then
+ log_error "No VM sizes are available in '${AZURE_LOCATION}' for this subscription."
+ log_info "This usually means the region has restrictive SKU policies for your subscription."
+ log_info "Try a different region: run 'azd env set AZURE_LOCATION <region>' and re-run 'azd up'."
+ log_info "Recommended regions: eastus2, westus3, southcentralus, northeurope"
+ exit 1
+fi
+
+if [ "${#ALL_CPU_VMS[@]}" -eq 0 ]; then
+ log_error "No CPU VM sizes (D/E/F-series) are available in '${AZURE_LOCATION}'."
+ log_info "AKS requires CPU nodes for system and workload pools."
+ log_info "Try a different region: run 'azd env set AZURE_LOCATION <region>' and re-run 'azd up'."
+ log_info "Recommended regions: eastus2, westus3, southcentralus, northeurope"
+ exit 1
+fi
+
+if [ "${#ALL_GPU_VMS[@]}" -eq 0 ]; then
+ log_error "No GPU VM sizes (NC/NV/ND-series) are available in '${AZURE_LOCATION}'."
+ log_info "GPU pools are required for video processing. Consider a region with GPU support."
+ log_info "Try: run 'azd env set AZURE_LOCATION <region>' and re-run 'azd up'."
+ log_info "Recommended GPU regions: eastus2, westus3, southcentralus, northeurope"
+ exit 1
+fi
+
+log_info "Querying VM quota in region..."
+if fetch_vm_quota_map "$AZURE_LOCATION"; then
+ log_success "Quota data retrieved for ${VM_QUOTA_MAP_COUNT} VM families"
+else
+ log_warning "Could not retrieve VM quota data; menus will show all SKUs."
+fi
+
select_vm_sizes_for_menu ALL_CPU_VMS SYSTEM_VMS SYSTEM_RECOMMENDED_FAMILIES "$CURRENT_SYSTEM_VM" "$SIZES_PER_FAMILY" 0 "$SYSTEM_MAX_CORES"
select_vm_sizes_for_menu ALL_CPU_VMS WORKLOAD_VMS WORKLOAD_RECOMMENDED_FAMILIES "$CURRENT_WORKLOAD_VM" "$SIZES_PER_FAMILY" "$WORKLOAD_MIN_CORES"
-select_vm_sizes_for_menu ALL_GPU_VMS GPU_VMS GPU_RECOMMENDED_FAMILIES "$CURRENT_DEEPSTREAM_VM" "$SIZES_PER_FAMILY"
+select_vm_sizes_for_menu ALL_GPU_VMS DEEPSTREAM_VMS GPU_RECOMMENDED_FAMILIES "$CURRENT_DEEPSTREAM_VM" "$SIZES_PER_FAMILY"
+select_vm_sizes_for_menu ALL_GPU_VMS INFERENCE_VMS GPU_RECOMMENDED_FAMILIES "$CURRENT_INFERENCE_VM" "$SIZES_PER_FAMILY"
+
+# Annotate + filter by quota using each pool's max node count.
+annotate_vm_sizes_with_quota SYSTEM_VMS "$CURRENT_SYSTEM_VM" "$SYSTEM_MAX_NODE_COUNT"
+annotate_vm_sizes_with_quota WORKLOAD_VMS "$CURRENT_WORKLOAD_VM" "$WORKLOAD_MAX_NODE_COUNT"
+annotate_vm_sizes_with_quota DEEPSTREAM_VMS "$CURRENT_DEEPSTREAM_VM" "$DEEPSTREAM_GPU_MAX_NODE_COUNT"
+annotate_vm_sizes_with_quota INFERENCE_VMS "$CURRENT_INFERENCE_VM" "$INFERENCE_GPU_MAX_NODE_COUNT"
# Resolve defaults — if the configured default isn't available, pick the closest match
CURRENT_SYSTEM_VM=$(resolve_default_sku "$CURRENT_SYSTEM_VM" SYSTEM_VMS 4)
CURRENT_WORKLOAD_VM=$(resolve_default_sku "$CURRENT_WORKLOAD_VM" WORKLOAD_VMS 32)
-CURRENT_DEEPSTREAM_VM=$(resolve_default_sku "$CURRENT_DEEPSTREAM_VM" GPU_VMS 24)
-CURRENT_INFERENCE_VM=$(resolve_default_sku "$CURRENT_INFERENCE_VM" GPU_VMS 24)
+CURRENT_DEEPSTREAM_VM=$(resolve_default_sku "$CURRENT_DEEPSTREAM_VM" DEEPSTREAM_VMS 24)
+CURRENT_INFERENCE_VM=$(resolve_default_sku "$CURRENT_INFERENCE_VM" INFERENCE_VMS 24)
-write_key_value "System pool" "${#SYSTEM_VMS[@]} sizes"
-write_key_value "Workload pool" "${#WORKLOAD_VMS[@]} sizes"
-write_key_value "GPU pool" "${#GPU_VMS[@]} sizes"
+write_key_value "System pool" "${#SYSTEM_VMS[@]} sizes (with quota)"
+write_key_value "Workload pool" "${#WORKLOAD_VMS[@]} sizes (with quota)"
+write_key_value "Deepstream pool" "${#DEEPSTREAM_VMS[@]} sizes (with quota)"
+write_key_value "Inference pool" "${#INFERENCE_VMS[@]} sizes (with quota)"
write_section "Choose a VM SKU for each AKS node pool"
log_info "The default is highlighted. Press Enter to accept, C for custom."
-# CPU pools (separate lists for system vs workload)
-show_vm_selection_menu "System (CPU)" "SYSTEM_VM_SIZE" SYSTEM_VMS "$CURRENT_SYSTEM_VM" "$AZURE_LOCATION"
+# CPU pools (quota-filtered lists, node count determines total cores checked)
+show_vm_selection_menu "System (CPU)" "SYSTEM_VM_SIZE" SYSTEM_VMS "$CURRENT_SYSTEM_VM" "$AZURE_LOCATION" "$SYSTEM_MAX_NODE_COUNT"
SYSTEM_SKU="$SELECTED_VM_SKU"
-show_vm_selection_menu "Workload (CPU)" "WORKLOAD_VM_SIZE" WORKLOAD_VMS "$CURRENT_WORKLOAD_VM" "$AZURE_LOCATION"
+show_vm_selection_menu "Workload (CPU)" "WORKLOAD_VM_SIZE" WORKLOAD_VMS "$CURRENT_WORKLOAD_VM" "$AZURE_LOCATION" "$WORKLOAD_MAX_NODE_COUNT"
WORKLOAD_SKU="$SELECTED_VM_SKU"
# GPU pools (quota validated inline, node count determines total cores checked)
-show_vm_selection_menu "Deepstream (GPU)" "DEEPSTREAM_GPU_VM_SIZE" GPU_VMS "$CURRENT_DEEPSTREAM_VM" "$AZURE_LOCATION" "$DEEPSTREAM_GPU_MAX_NODE_COUNT" "gpu"
+show_vm_selection_menu "Deepstream (GPU)" "DEEPSTREAM_GPU_VM_SIZE" DEEPSTREAM_VMS "$CURRENT_DEEPSTREAM_VM" "$AZURE_LOCATION" "$DEEPSTREAM_GPU_MAX_NODE_COUNT" "gpu"
DEEPSTREAM_GPU_VM_SIZE="$SELECTED_VM_SKU"
-show_vm_selection_menu "Inference (GPU)" "INFERENCE_GPU_VM_SIZE" GPU_VMS "$CURRENT_INFERENCE_VM" "$AZURE_LOCATION" "$INFERENCE_GPU_MAX_NODE_COUNT" "gpu"
+show_vm_selection_menu "Inference (GPU)" "INFERENCE_GPU_VM_SIZE" INFERENCE_VMS "$CURRENT_INFERENCE_VM" "$AZURE_LOCATION" "$INFERENCE_GPU_MAX_NODE_COUNT" "gpu"
INFERENCE_GPU_VM_SIZE="$SELECTED_VM_SKU"
# =====================================================
diff --git a/hooks/ui.sh b/hooks/ui.sh
index 250bfcb..456377d 100755
--- a/hooks/ui.sh
+++ b/hooks/ui.sh
@@ -109,6 +109,12 @@ log_info() {
_write_log_message "$1" "$SYM_INFO" "$C_ACCENT"
}
+# Character count (not byte count) — safe for UTF-8 glyphs like symbols.
+# Use this instead of ${#var} when computing display widths.
+_str_len() {
+ printf '%s' "$1" | wc -m | tr -d ' '
+}
+
log_success() {
_write_log_message "$1" "$SYM_SUCCESS" "$C_SUCCESS" "$C_SUCCESS"
}
@@ -128,7 +134,7 @@ log_step() {
# Divider length matches the title line above (min 40)
local header_text="${SYM_STEP} [${number}/${total}] ${title}"
- local divider_len=${#header_text}
+ local divider_len; divider_len=$(_str_len "$header_text")
[ "$divider_len" -lt 40 ] && divider_len=40
local rule=""
@@ -169,7 +175,7 @@ write_box_banner() {
local spaces; spaces=$(_repeat_char " " "$inner_width")
# Centered title
- local text_len=${#text}
+ local text_len; text_len=$(_str_len "$text")
local left_pad=$(( (inner_width - text_len) / 2 ))
local right_pad=$(( inner_width - text_len - left_pad ))
local left_spaces; left_spaces=$(_repeat_char " " "$left_pad")
@@ -182,7 +188,7 @@ write_box_banner() {
"$color" "$right_spaces" "$v" "$C_RESET"
if [ -n "$subtitle" ]; then
- local sub_len=${#subtitle}
+ local sub_len; sub_len=$(_str_len "$subtitle")
local sub_left=$(( (inner_width - sub_len) / 2 ))
local sub_right=$(( inner_width - sub_len - sub_left ))
local sub_left_sp; sub_left_sp=$(_repeat_char " " "$sub_left")
@@ -231,7 +237,7 @@ write_foundry_banner() {
write_title() {
local text="$1"
- local text_len=${#text}
+ local text_len; text_len=$(_str_len "$text")
local rule; rule=$(_repeat_char "$SYM_HLINE" "$((text_len + 4))")
printf "\n %b%s%b\n" "$C_BOLD_WHITE" "$text" "$C_RESET"
printf " %b%s%b\n" "$C_ACCENT_DIM" "$rule" "$C_RESET"
diff --git a/hooks/validate-env.ps1 b/hooks/validate-env.ps1
index 31ed873..6ce57b5 100644
--- a/hooks/validate-env.ps1
+++ b/hooks/validate-env.ps1
@@ -36,7 +36,7 @@ Test-EnvVarFormat -VarName "AZURE_SUBSCRIPTION_ID" -Pattern $uuidPattern -Exampl
Test-EnvVarFormat -VarName "AZURE_PRINCIPAL_ID" -Pattern $uuidPattern -Example "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
# ── VM sizes must start with Standard_ ────────────────────────────────────
-$vmPattern = '^Standard_\w+'
+$vmPattern = '^Standard_[A-Za-z0-9_]+$'
Test-EnvVarFormat -VarName "SYSTEM_VM_SIZE" -Pattern $vmPattern -Example "Standard_D4a_v4"
Test-EnvVarFormat -VarName "WORKLOAD_VM_SIZE" -Pattern $vmPattern -Example "Standard_D32a_v4"
Test-EnvVarFormat -VarName "DEEPSTREAM_GPU_VM_SIZE" -Pattern $vmPattern -Example "Standard_NC24ads_A100_v4"
@@ -53,7 +53,7 @@ $skuPattern = '^Standard_(LRS|ZRS|GRS)$'
Test-EnvVarFormat -VarName "STORAGE_SKU_NAME" -Pattern $skuPattern -Example "Standard_LRS, Standard_ZRS, or Standard_GRS"
# ── Location must be non-empty if set ──────────────────────────────────────
-$locationPattern = '^[a-z0-9]+$'
+$locationPattern = '^[a-z0-9]+(-[a-z0-9]+)*$'
Test-EnvVarFormat -VarName "AZURE_LOCATION" -Pattern $locationPattern -Example "eastus2"
if ($hasError) {
diff --git a/hooks/validate-env.sh b/hooks/validate-env.sh
index cefe800..d86680a 100755
--- a/hooks/validate-env.sh
+++ b/hooks/validate-env.sh
@@ -8,8 +8,7 @@
# Pattern adapted from get-started-with-ai-agents/scripts/validate_env_vars.sh
# =============================================================================
-set -e
-
+set -eo pipefail
has_error=false
validate_env_var() {
@@ -38,7 +37,7 @@ validate_env_var "AZURE_SUBSCRIPTION_ID" "$uuid_pattern" "xxxxxxxx-xxxx-xxxx-xxx
validate_env_var "AZURE_PRINCIPAL_ID" "$uuid_pattern" "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
# ── VM sizes must start with Standard_ ────────────────────────────────────
-vm_pattern='^Standard_'
+vm_pattern='^Standard_[A-Za-z0-9_]+$'
validate_env_var "SYSTEM_VM_SIZE" "$vm_pattern" "Standard_D4a_v4"
validate_env_var "WORKLOAD_VM_SIZE" "$vm_pattern" "Standard_D32a_v4"
validate_env_var "DEEPSTREAM_GPU_VM_SIZE" "$vm_pattern" "Standard_NC24ads_A100_v4"
@@ -55,7 +54,7 @@ sku_pattern='^Standard_(LRS|ZRS|GRS)$'
validate_env_var "STORAGE_SKU_NAME" "$sku_pattern" "Standard_LRS, Standard_ZRS, or Standard_GRS"
# ── Location must be non-empty lowercase alphanumeric if set ───────────────
-location_pattern='^[a-z0-9]+$'
+location_pattern='^[a-z0-9]+(-[a-z0-9]+)*$'
validate_env_var "AZURE_LOCATION" "$location_pattern" "eastus2"
if [ "$has_error" = true ]; then
diff --git a/infra/main.bicep b/infra/main.bicep
index f5e10d0..0018425 100644
--- a/infra/main.bicep
+++ b/infra/main.bicep
@@ -23,7 +23,7 @@ param principalId string = ''
param createRoleForUser bool
@description('Whether to create a Foundry project and link it to the VI extension')
-param createFoundryProject bool
+param createFoundryProject bool = true
@description('Model name to deploy in AI Foundry')
param aiModelName string
diff --git a/infra/main.parameters.json b/infra/main.parameters.json
index 3372c50..073a886 100644
--- a/infra/main.parameters.json
+++ b/infra/main.parameters.json
@@ -18,7 +18,7 @@
"value": "${CREATE_ROLE_FOR_USER=true}"
},
"kubernetesVersion": {
- "value": "${KUBERNETES_VERSION=1.32}"
+ "value": "${KUBERNETES_VERSION=1.34}"
},
"systemVmSize": {
"value": "${SYSTEM_VM_SIZE=Standard_D4a_v4}"
diff --git a/infra/modules/aks.bicep b/infra/modules/aks.bicep
index cff9534..f47e71b 100644
--- a/infra/modules/aks.bicep
+++ b/infra/modules/aks.bicep
@@ -22,10 +22,10 @@ param deepstreamGpuVmSize string
@description('VM size for the GPU inference workload node pool')
param inferenceGpuVmSize string
-@description('Maximum number of system nodes')
+@description('Number of nodes in the system node pool (fixed count — not autoscaled)')
@minValue(1)
@maxValue(10)
-param systemMaxNodeCount int = 2
+param systemNodeCount int = 2
@description('Maximum number of workload nodes')
@minValue(1)
@@ -48,10 +48,10 @@ param dnsPrefix string = name
@description('Node resource group name')
param nodeResourceGroup string
-@description('Node label value used to target deepstream workloads')
+// Node label value used to target deepstream workloads
var deepstreamWorkloadLabel = 'deepstream'
-@description('Node label value used to target inference workloads')
+// Node label value used to target inference workloads
var inferenceWorkloadLabel = 'inference'
resource aksCluster 'Microsoft.ContainerService/managedClusters@2024-09-01' = {
@@ -111,7 +111,7 @@ resource aksCluster 'Microsoft.ContainerService/managedClusters@2024-09-01' = {
agentPoolProfiles: [
{
name: 'system'
- count: systemMaxNodeCount
+ count: systemNodeCount
vmSize: systemVmSize
osType: 'Linux'
osSKU: 'AzureLinux'