diff --git a/.github/workflows/azure-dev.yml b/.github/workflows/azure-dev.yml index 9dd0621..bf6fca2 100644 --- a/.github/workflows/azure-dev.yml +++ b/.github/workflows/azure-dev.yml @@ -59,21 +59,27 @@ jobs: uses: azure/setup-kubectl@v3 - name: Install helm uses: azure/setup-helm@v4 - - name: Log in with Azure (Federated Credentials) + - name: Login to Azure + uses: azure/login@v2 + with: + client-id: ${{ vars.AZURE_CLIENT_ID }} + tenant-id: ${{ vars.AZURE_TENANT_ID }} + subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }} + - name: Set Azure subscription + shell: bash + run: az account set --subscription "$AZURE_SUBSCRIPTION_ID" + - name: Login to AZD + shell: bash run: | - azd auth login ` - --client-id "$Env:AZURE_CLIENT_ID" ` - --federated-credential-provider "github" ` - --tenant-id "$Env:AZURE_TENANT_ID" - shell: pwsh - - - name: Provision Infrastructure - run: azd provision --no-prompt - env: - AZD_INITIAL_ENVIRONMENT_CONFIG: ${{ secrets.AZD_INITIAL_ENVIRONMENT_CONFIG }} - - # NOTE: azure.yaml has no services: block, so azd deploy is a no-op. - # The postprovision hook handles all post-Bicep setup (Arc, GPU, VI extension). - # This step is kept for forward-compatibility if app services are added. - - name: Deploy Application - run: azd deploy --no-prompt + azd auth login \ + --client-id "$AZURE_CLIENT_ID" \ + --federated-credential-provider "github" \ + --tenant-id "$AZURE_TENANT_ID" + - name: Provision + shell: bash + run: | + if ! 
azd env select "$AZURE_ENV_NAME"; then + azd env new "$AZURE_ENV_NAME" --subscription "$AZURE_SUBSCRIPTION_ID" --location "$AZURE_LOCATION" --no-prompt + fi + azd config set defaults.subscription "$AZURE_SUBSCRIPTION_ID" + azd up --no-prompt diff --git a/README.md b/README.md index bf8194f..8badad3 100644 --- a/README.md +++ b/README.md @@ -25,11 +25,11 @@ By deploying the Video Indexer Arc extension on an Arc-enabled AKS cluster with This solution optionally creates a Microsoft Foundry project and Foundry Tools (enabled by default; set `CREATE_FOUNDRY_PROJECT=false` to skip). More details about the resources can be found in the [resources](#resources) documentation. -### Solution architecture +## Solution architecture |![image](./docs/images/readme/architecture.png)| |---| -### Key features +## Features
Learn more about the key features this solution enables @@ -44,9 +44,10 @@ This solution optionally creates a Microsoft Foundry project and Foundry Tools (

-Getting Started +Quick Deploy

+## Getting Started Follow the quick deploy steps on the deployment guide to deploy this solution to your own Azure subscription. > **Note:** This solution accelerator requires **Azure Developer CLI (azd) version 1.18.0 or higher**. Please ensure you have the latest version installed before proceeding with deployment. [Download azd here](https://learn.microsoft.com/en-us/azure/developer/azure-developer-cli/install-azd). diff --git a/azure.yaml b/azure.yaml index feac005..fb06423 100644 --- a/azure.yaml +++ b/azure.yaml @@ -50,13 +50,6 @@ hooks: shell: sh run: ./hooks/postup.sh interactive: true - predown: - windows: - shell: pwsh - run: ./hooks/predown.ps1 - posix: - shell: sh - run: ./hooks/predown.sh pipeline: variables: - AZURE_RESOURCE_GROUP diff --git a/hooks/common.ps1 b/hooks/common.ps1 index 40627ce..913659c 100644 --- a/hooks/common.ps1 +++ b/hooks/common.ps1 @@ -7,6 +7,54 @@ . "$PSScriptRoot/config.ps1" . "$PSScriptRoot/ui.ps1" +# ── Helpers ───────────────────────────────────────────────────────────────── + +function Invoke-AzJson { + <# + .SYNOPSIS + Runs an `az` command (passed as a script block or string array), captures + stdout, and returns parsed JSON. Returns $null if the command produced no + output or if parsing failed. Never throws — callers must null-check. + #> + param([Parameter(Mandatory)] [scriptblock]$Command) + try { + $raw = & $Command 2>&1 + if ($LASTEXITCODE -ne 0) { + $snippet = if ($raw) { ($raw | Out-String).Trim() } else { '(no output)' } + if ($snippet.Length -gt 400) { $snippet = $snippet.Substring(0, 400) + '...' 
} + Log-Warning "az command failed (exit $LASTEXITCODE): $snippet" + return $null + } + if ($null -eq $raw -or ($raw -is [string] -and [string]::IsNullOrWhiteSpace($raw))) { + return $null + } + if ($raw -is [array]) { $raw = ($raw -join "`n") } + if ([string]::IsNullOrWhiteSpace($raw)) { return $null } + return $raw | ConvertFrom-Json -ErrorAction Stop + } + catch { + Log-Warning "Invoke-AzJson exception: $($_.Exception.Message)" + return $null + } +} + +function Invoke-AzdEnvSet { + <# + .SYNOPSIS + Persists a key/value via `azd env set`, warning (but not failing) on error. + #> + param( + [Parameter(Mandatory)] [string]$Name, + [Parameter(Mandatory)] [AllowEmptyString()] [string]$Value + ) + & azd env set $Name $Value 2>$null + if ($LASTEXITCODE -ne 0) { + Log-Warning "Could not persist '$Name' via 'azd env set' (exit $LASTEXITCODE)." + return $false + } + return $true +} + # ── Prerequisite Checks ───────────────────────────────────────────────────── function Assert-EnvVars { @@ -125,16 +173,21 @@ function Connect-AksCluster { function Get-RunningPodCount { <# .SYNOPSIS - Returns the number of Running pods in a given namespace. + Returns the number of "healthy" pods in a namespace. + Counts both Running and Succeeded pods — some workloads (e.g. the + GPU operator's cuda-validator / install jobs) intentionally end in + Succeeded and should not be reported as degraded. 
#> param( [string]$Namespace, [string]$KubeContext ) try { - $count = (kubectl --context $KubeContext get pods -n $Namespace ` + $running = (kubectl --context $KubeContext get pods -n $Namespace ` --field-selector=status.phase=Running --no-headers 2>$null | Measure-Object -Line).Lines - return [int]$count + $succeeded = (kubectl --context $KubeContext get pods -n $Namespace ` + --field-selector=status.phase=Succeeded --no-headers 2>$null | Measure-Object -Line).Lines + return [int]$running + [int]$succeeded } catch { return 0 @@ -186,14 +239,15 @@ function Get-AzVmSizesForRegion { Queries az vm list-skus for a region and returns an array of hashtables. Uses list-skus (not list-sizes) to respect subscription restrictions — only SKUs the subscription is allowed to use are returned. - Each entry: @{ Name; Cores; MemoryGB } + Each entry: @{ Name; Cores; MemoryGB; Family } + Family is the quota family name (matches az vm list-usage name.value). #> param([string]$Location) - $query = "[?restrictions[?type=='Location']|length(@)==``0``].{name:name, vCPUs:capabilities[?name=='vCPUs'].value|[0], memGB:capabilities[?name=='MemoryGB'].value|[0]}" - $raw = (az vm list-skus --location $Location --resource-type virtualMachines --query $query -o json 2>$null) | ConvertFrom-Json + $query = "[?restrictions[?type=='Location']|length(@)==``0``].{name:name, family:family, vCPUs:capabilities[?name=='vCPUs'].value|[0], memGB:capabilities[?name=='MemoryGB'].value|[0]}" + $raw = Invoke-AzJson { az vm list-skus --location $Location --resource-type virtualMachines --query $query -o json } if (-not $raw) { return @() } return $raw | ForEach-Object { - @{ Name = $_.name; Cores = [int]$_.vCPUs; MemoryGB = [int]$_.memGB } + @{ Name = $_.name; Cores = [int]$_.vCPUs; MemoryGB = [int]$_.memGB; Family = $_.family } } } @@ -238,6 +292,11 @@ function Select-VmSizesForMenu { For each family, matches VM names that contain the family pattern, filters by core range, takes up to $SizesPerFamily sorted by 
cores. The default SKU is always included. Result is sorted by cores. + + When QuotaData is supplied, each entry is annotated with quota info + (AvailableQuota, QuotaLimit, QuotaFamily, HasEnoughQuota). SKUs whose + family has too little quota to run $MaxNodes of that size are dropped, + except the default SKU which is kept (annotated) so the user sees it. #> param( [array]$AllSizes, @@ -246,7 +305,9 @@ function Select-VmSizesForMenu { [string]$DefaultSku, [int]$SizesPerFamily = 3, [int]$MinCores = 0, - [int]$MaxCores = [int]::MaxValue + [int]$MaxCores = [int]::MaxValue, + [hashtable]$QuotaData, + [int]$MaxNodes = 1 ) # Filter by broad prefix first (CPU vs GPU), then by core range @@ -273,8 +334,49 @@ function Select-VmSizesForMenu { if ($defVm) { $selected[$defVm.Name] = $defVm } } - # Return sorted by cores - return $selected.Values | Sort-Object { $_.Cores }, { $_.Name } + $entries = $selected.Values | Sort-Object { $_.Cores }, { $_.Name } + + # ── Annotate with quota + drop SKUs that cannot satisfy $MaxNodes ───── + if ($QuotaData -and $QuotaData.Count -gt 0) { + $annotated = @() + foreach ($vm in $entries) { + $fam = Get-QuotaFamilyForVm -VmSize $vm.Name -SkuFamily $vm.Family + $avail = $null; $limit = $null; $hasEnough = $true; $familyKnown = $false + if ($fam -and $QuotaData.ContainsKey($fam)) { + $familyKnown = $true + $avail = [int]$QuotaData[$fam].Available + $limit = [int]$QuotaData[$fam].Limit + $needed = [int]$vm.Cores * [math]::Max(1, [int]$MaxNodes) + $hasEnough = ($limit -gt 0) -and ($avail -ge $needed) + } + # Clone hashtable so we don't mutate the shared $AllSizes entries + $copy = @{} + foreach ($k in $vm.Keys) { $copy[$k] = $vm[$k] } + $copy.QuotaFamily = $fam + $copy.QuotaFamilyKnown = $familyKnown + $copy.AvailableQuota = $avail + $copy.QuotaLimit = $limit + $copy.HasEnoughQuota = $hasEnough + $annotated += ,$copy + } + + # Keep SKUs that either (a) have enough quota in a known family, + # (b) are in an unknown family (can't verify — don't 
hide newer SKUs), + # or (c) are the configured default. Unknown families are treated as + # "OK" here; the final quota check before submission still guards them. + $filtered = $annotated | Where-Object { + (-not $_.QuotaFamilyKnown) -or $_.HasEnoughQuota -or ($_.Name -eq $DefaultSku) + } + + # Fallback: if quota would empty the list, return the unfiltered annotated + # set so the user can still pick something and be warned. + if (-not $filtered -or @($filtered).Count -eq 0) { + return $annotated + } + return @($filtered) + } + + return $entries } function Get-AzVmQuotaForRegion { @@ -285,7 +387,7 @@ function Get-AzVmQuotaForRegion { #> param([string]$Location) $result = @{} - $raw = (az vm list-usage --location $Location -o json 2>$null) | ConvertFrom-Json + $raw = Invoke-AzJson { az vm list-usage --location $Location -o json } if (-not $raw) { return $result } foreach ($q in $raw) { $result[$q.name.value] = @{ @@ -300,14 +402,21 @@ function Get-AzVmQuotaForRegion { function Get-QuotaFamilyForVm { <# .SYNOPSIS - Resolves the quota family name for a GPU VM size using the - GPU_QUOTA_FAMILY_MAP regex lookup table. No API call needed. - Returns the family string, or $null if no pattern matches. + Resolves the quota family name for a VM size. + Prefers the family string reported by az vm list-skus (passed via + -SkuFamily) because it matches az vm list-usage's name.value directly. + Falls back to the GPU_QUOTA_FAMILY_MAP regex table for older SKUs where + the family field is empty. + Returns the family string, or $null if nothing matches. 
#> param( [string]$VmSize, + [string]$SkuFamily, [string]$Location # kept for interface compat, not used ) + if (-not [string]::IsNullOrWhiteSpace($SkuFamily)) { + return $SkuFamily + } foreach ($pattern in $GPU_QUOTA_FAMILY_MAP.Keys) { if ($VmSize -match $pattern) { return $GPU_QUOTA_FAMILY_MAP[$pattern] @@ -378,12 +487,10 @@ function Resolve-ModelQuota { $modelType = "$Format.$DeploymentType.$Model" Log-Info "Checking quota for $modelType in $Location..." - $modelInfo = $null - try { - $modelInfo = (az cognitiveservices usage list --location $Location ` - --query "[?name.value=='$modelType'] | [0]" -o json 2>$null) | ConvertFrom-Json + $modelInfo = Invoke-AzJson { + az cognitiveservices usage list --location $Location ` + --query "[?name.value=='$modelType'] | [0]" -o json } - catch { } if (-not $modelInfo) { Log-Warning "No quota info found for '$modelType' in '$Location'. Skipping quota check." @@ -421,7 +528,7 @@ function Resolve-ModelQuota { } } while (-not $validInput) - azd env set $CapacityEnvVarName $parsed 2>$null + [void](Invoke-AzdEnvSet -Name $CapacityEnvVarName -Value "$parsed") Log-Success "Capacity adjusted to $parsed (saved to $CapacityEnvVarName)" } else { @@ -502,8 +609,20 @@ function Show-VmSelectionMenu { $name = $Entry.Name.PadRight(35) $cores = "$($Entry.Cores) vCPUs".PadRight(10) $mem = "$($Entry.MemoryGB) GB".PadRight(8) - $tag = if ($IsDefault) { " (default)" } else { "" } - return "${name} ${cores} ${mem}${tag}" + + # Quota column (only when quota data was supplied) + $quotaCol = "" + if ($Entry.ContainsKey('QuotaFamilyKnown')) { + if ($Entry.QuotaFamilyKnown) { + $quotaCol = "$($Entry.AvailableQuota) free".PadRight(14) + } + else { + $quotaCol = "quota n/a".PadRight(14) + } + } + + $tag = if ($IsDefault) { " (default)" } else { "" } + return "${name} ${cores} ${mem} ${quotaCol}${tag}" } # ── Redraw the visible viewport in-place ─────────────────────── @@ -577,6 +696,9 @@ function Show-VmSelectionMenu { Write-Host "" Write-Section 
"Select VM size for $PoolName ($($VmSizes.Count) sizes available)" Log-Info "Use $([char]0x2191)/$([char]0x2193) to move, Enter to select, C custom, Esc cancel" + if ($VmSizes.Count -gt 0 -and $VmSizes[0].ContainsKey('QuotaFamilyKnown')) { + Log-Info "Quota column shows cores free in this region (pool max nodes: $MaxNodes)." + } Write-Host "" # Reserve exactly $maxVisible blank lines (viewport size, not total items) @@ -660,17 +782,17 @@ function Show-VmSelectionMenu { # ── GPU quota check ──────────────────────────────────────── if ($IsGpu) { Write-LogMessage -Message "Resolving quota family..." -Symbol $script:Sym.Info -SymbolColor $script:C.Accent -NoNewline - $selectedFamily = Get-QuotaFamilyForVm -VmSize $selectedSku -Location $Location + $skuFamily = $null + $match = $VmSizes | Where-Object { $_.Name -eq $selectedSku } | Select-Object -First 1 + if ($match -and $match.ContainsKey('Family')) { $skuFamily = $match.Family } + $selectedFamily = Get-QuotaFamilyForVm -VmSize $selectedSku -SkuFamily $skuFamily -Location $Location if ($selectedFamily) { Write-Host " $($script:C.Muted)$selectedFamily$($script:C.Reset)" $totalCoresNeeded = $selectedCores * $MaxNodes $quotaResult = Assert-VmQuota -Label $PoolName -Family $selectedFamily -QuotaData $QuotaData -CoresNeeded $totalCoresNeeded if ($quotaResult -in @("zero", "low")) { - $proceed = Read-Host " Continue with this VM anyway? (y = keep, n = re-select) [n]" - if ($proceed -ne 'y' -and $proceed -ne 'Y') { - Log-Warning "Re-showing menu..." - continue - } + Log-Warning "Re-showing menu..." 
+ continue } } else { diff --git a/hooks/common.sh b/hooks/common.sh index 1f2b747..46540d3 100755 --- a/hooks/common.sh +++ b/hooks/common.sh @@ -9,13 +9,27 @@ HOOKS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${HOOKS_DIR}/config.sh" source "${HOOKS_DIR}/ui.sh" +# ── Helpers ───────────────────────────────────────────────────────────────── + +# Persist a key/value via `azd env set`, warning (but not failing) on error. +# Usage: azd_env_set NAME VALUE +azd_env_set() { + local name="$1" + local value="${2:-}" + if ! azd env set "$name" "$value" 2>/dev/null; then + log_warning "Could not persist '${name}' via 'azd env set'." + return 1 + fi + return 0 +} + # ── Prerequisite Checks ───────────────────────────────────────────────────── assert_env_vars() { # Usage: assert_env_vars VAR1 VAR2 VAR3 local missing=() for var in "$@"; do - eval val=\$$var 2>/dev/null || val="" + val="${!var:-}" if [ -z "$val" ]; then missing+=("$var") fi @@ -74,7 +88,7 @@ register_required_providers() { local providers_registering=0 for provider in "${providers[@]}"; do local state - state=$(az provider show -n "$provider" --query "registrationState" -o tsv 2>/dev/null || echo "Unknown") + state=$(az provider show -n "$provider" --query "registrationState" -o tsv 2>/dev/null | tr -d '\r' || echo "Unknown") case "$state" in Registered) log_success "$provider" @@ -126,10 +140,17 @@ connect_aks_cluster() { get_running_pod_count() { # Usage: count=$(get_running_pod_count namespace kube_context) + # Counts both Running and Succeeded pods — some workloads (e.g. the GPU + # operator's cuda-validator / install jobs) intentionally end in Succeeded + # and should not be reported as degraded. 
local namespace="$1" local kube_context="$2" - kubectl --context "$kube_context" get pods -n "$namespace" \ - --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ' + local running succeeded + running=$(kubectl --context "$kube_context" get pods -n "$namespace" \ + --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l | tr -d ' ') + succeeded=$(kubectl --context "$kube_context" get pods -n "$namespace" \ + --field-selector=status.phase=Succeeded --no-headers 2>/dev/null | wc -l | tr -d ' ') + echo $(( ${running:-0} + ${succeeded:-0} )) } get_total_pod_count() { @@ -151,20 +172,33 @@ test_namespace_exists() { # Fetches VM sizes for a region, filtered by name prefixes, sorted by cores. # Uses az vm list-skus (not list-sizes) to respect subscription restrictions. -# Output: pipe-delimited lines "name|cores|memGB" to stdout. +# Output: pipe-delimited lines "name|cores|memGB|family" to stdout. +# family is the az vm list-skus "family" field — matches az vm list-usage +# name.value so we can look up quota directly. get_filtered_vm_sizes() { local location="$1"; shift local -a prefixes=("$@") - local filter="" - for p in "${prefixes[@]}"; do - [ -n "$filter" ] && filter="${filter} || " - filter="${filter}starts_with(name, '${p}')" - done + + # Build a JMESPath expression like: + # (starts_with(name, 'Standard_D') || starts_with(name, 'Standard_E')) + # so Azure does the prefix filtering server-side instead of piping every + # unrestricted SKU through bash. 
+ local prefix_expr="" + if [ "${#prefixes[@]}" -gt 0 ]; then + local joined="" sep="" + for p in "${prefixes[@]}"; do + joined+="${sep}starts_with(name, \`${p}\`)" + sep=" || " + done + prefix_expr=" && (${joined})" + fi + + local query="[?(restrictions[?type==\`Location\`]|length(@)==\`0\`)${prefix_expr}].{n:name, c:capabilities[?name==\`vCPUs\`].value|[0], m:capabilities[?name==\`MemoryGB\`].value|[0], f:family}" az vm list-skus --location "$location" --resource-type virtualMachines \ - --query "sort_by([?restrictions[?type=='Location']|length(@)==\`0\` && (${filter})], &to_number(capabilities[?name=='vCPUs'].value|[0]))[].{n:name, c:capabilities[?name=='vCPUs'].value|[0], m:capabilities[?name=='MemoryGB'].value|[0]}" \ - -o tsv 2>/dev/null | while IFS=$'\t' read -r name cores memGB; do - echo "${name}|${cores}|${memGB}" - done + --query "$query" -o tsv 2>/dev/null | tr -d '\r' | while IFS=$'\t' read -r name cores memGB family; do + [ -z "$name" ] && continue + echo "${name}|${cores}|${memGB}|${family}" + done | sort -t'|' -k2 -n } # Checks if a default SKU is available; if not, picks the closest by core count. @@ -188,7 +222,7 @@ resolve_default_sku() { # Not available — pick closest by core count log_warning "Default SKU '$default_sku' is not available in this subscription/region." 
>&2 - local best_sku="" best_diff=999999 + local best_sku="" best_diff=999999 best_cores=0 for entry in "${_sizes[@]}"; do local sku="${entry%%|*}" local rest="${entry#*|}" @@ -231,7 +265,7 @@ select_vm_sizes_for_menu() { local cores="${rest%%|*}" [ "$cores" -lt "$min_cores" ] && continue [ "$cores" -gt "$max_cores" ] && continue - if echo "$sku" | grep -qE "$pattern"; then + if [[ "$sku" =~ $pattern ]]; then if [ -z "${seen[$sku]+x}" ]; then seen[$sku]=1 _out+=("$entry") @@ -250,10 +284,13 @@ select_vm_sizes_for_menu() { done fi - # Sort output by cores - local -a sorted - mapfile -t sorted < <(printf '%s\n' "${_out[@]}" | sort -t'|' -k2 -n) - _out=("${sorted[@]}") + # Sort output by cores (guard: empty array → printf with no args emits a + # bare newline that mapfile captures as one phantom empty-string element) + if [ "${#_out[@]}" -gt 0 ]; then + local -a sorted + mapfile -t sorted < <(printf '%s\n' "${_out[@]}" | sort -t'|' -k2 -n) + _out=("${sorted[@]}") + fi } # Looks up quota for a family directly via az CLI (single call). @@ -267,7 +304,7 @@ lookup_vm_quota() { local raw raw=$(az vm list-usage --location "$location" \ --query "[?name.value=='${family}'] | [0].{l:limit, u:currentValue}" \ - -o tsv 2>/dev/null || true) + -o tsv 2>/dev/null | tr -d '\r' || true) [ -z "$raw" ] && return 1 VM_QUOTA_LIMIT=$(echo "$raw" | cut -f1) VM_QUOTA_USED=$(echo "$raw" | cut -f2) @@ -275,13 +312,21 @@ lookup_vm_quota() { return 0 } -# Resolves quota family for a GPU VM using the pattern lookup table. +# Resolves quota family for a VM size. +# Prefers the sku_family argument (from az vm list-skus "family" field, which +# matches az vm list-usage name.value). Falls back to the regex table in +# GPU_QUOTA_PATTERNS / GPU_QUOTA_FAMILIES for older SKUs with an empty family. 
GET_QUOTA_FAMILY_RESULT="" get_quota_family_for_vm() { local vm_size="$1" + local sku_family="${2:-}" GET_QUOTA_FAMILY_RESULT="" + if [ -n "$sku_family" ]; then + GET_QUOTA_FAMILY_RESULT="$sku_family" + return 0 + fi for ((i=0; i<${#GPU_QUOTA_PATTERNS[@]}; i++)); do - if echo "$vm_size" | grep -qE "${GPU_QUOTA_PATTERNS[$i]}"; then + if [[ "$vm_size" =~ ${GPU_QUOTA_PATTERNS[$i]} ]]; then GET_QUOTA_FAMILY_RESULT="${GPU_QUOTA_FAMILIES[$i]}" return 0 fi @@ -289,6 +334,87 @@ get_quota_family_for_vm() { return 1 } +# Fetches full per-family quota map for a region in a single call. +# Populates the global associative array VM_QUOTA_MAP[family]="avail|limit". +declare -gA VM_QUOTA_MAP=() +VM_QUOTA_MAP_COUNT=0 +fetch_vm_quota_map() { + local location="$1" + VM_QUOTA_MAP=() + VM_QUOTA_MAP_COUNT=0 + local raw + raw=$(az vm list-usage --location "$location" \ + --query "[].{n:name.value, l:limit, u:currentValue}" \ + -o tsv 2>/dev/null | tr -d '\r' || true) + [ -z "$raw" ] && return 1 + while IFS=$'\t' read -r fam limit used; do + [ -z "$fam" ] && continue + local avail=$((limit - used)) + VM_QUOTA_MAP["$fam"]="${avail}|${limit}" + VM_QUOTA_MAP_COUNT=$((VM_QUOTA_MAP_COUNT + 1)) + done <<< "$raw" + return 0 +} + +# Annotates + filters a VM size array by quota. +# Input entries: "name|cores|memGB|family" (from get_filtered_vm_sizes). +# Output entries: "name|cores|memGB|family|avail|limit|hasEnough" where: +# - avail/limit are empty when family is unknown to the quota map +# - hasEnough is 1/0 (always 1 when family unknown, to avoid hiding SKUs) +# SKUs that can't satisfy $max_nodes are dropped, except $default_sku which is +# kept (annotated) so the user always sees it. If filtering would empty the +# list, the unfiltered annotated set is returned instead. 
+annotate_vm_sizes_with_quota() { + local -n _list=$1 + local default_sku="${2:-}" + local max_nodes="${3:-1}" + [ "$max_nodes" -lt 1 ] && max_nodes=1 + + local -a annotated=() kept=() + + for entry in "${_list[@]}"; do + local name="${entry%%|*}" + local rest="${entry#*|}" + local cores="${rest%%|*}"; rest="${rest#*|}" + local mem="${rest%%|*}"; rest="${rest#*|}" + local family="${rest}" + + get_quota_family_for_vm "$name" "$family" >/dev/null || true + local fam="$GET_QUOTA_FAMILY_RESULT" + + local avail="" limit="" has_enough=1 family_known=0 + if [ -n "$fam" ] && [ -n "${VM_QUOTA_MAP[$fam]+x}" ]; then + family_known=1 + local qinfo="${VM_QUOTA_MAP[$fam]}" + avail="${qinfo%%|*}" + limit="${qinfo##*|}" + local needed=$((cores * max_nodes)) + if [ "$limit" -le 0 ] || [ "$avail" -lt "$needed" ]; then + has_enough=0 + fi + fi + + local annotated_entry="${name}|${cores}|${mem}|${fam}|${avail}|${limit}|${has_enough}" + annotated+=("$annotated_entry") + + # Keep SKUs with unknown quota family (can't verify, don't hide newer + # SKUs), SKUs with enough quota, or the configured default. + if [ "$family_known" = "0" ] || [ "$has_enough" = "1" ]; then + kept+=("$annotated_entry") + elif [ "$name" = "$default_sku" ]; then + kept+=("$annotated_entry") + fi + done + + # Fall back to the unfiltered list if filtering emptied it + if [ "${#kept[@]}" -eq 0 ]; then + _list=("${annotated[@]}") + else + _list=("${kept[@]}") + fi + return 0 +} + # Checks GPU quota, prints formatted status, sets ASSERT_QUOTA_RESULT. ASSERT_QUOTA_RESULT="" assert_vm_quota() { @@ -343,20 +469,25 @@ resolve_model_quota() { log_info "Checking quota for $model_type in $location..." local model_info + # Query limit + current as TSV (two tab-separated fields, explicit order) — + # avoids fragile awk JSON parsing. 
model_info=$(az cognitiveservices usage list --location "$location" \ - --query "[?name.value=='$model_type']" --output json 2>/dev/null | tr '[:upper:]' '[:lower:]') + --query "[?name.value=='$model_type'] | [0].{l:limit, u:currentValue}" \ + --output tsv 2>/dev/null | tr -d '\r') - if [ -z "$model_info" ] || [ "$model_info" = "[]" ]; then + if [ -z "$model_info" ]; then log_warning "No quota info found for '$model_type' in '$location'. Skipping quota check." log_info "The model may not be available in this region. Bicep will report a clearer error if so." return 0 fi local current_value limit - current_value=$(echo "$model_info" | awk -F': ' '/"currentvalue"/ {print $2}' | tr -d ',' | tr -d ' ') - limit=$(echo "$model_info" | awk -F': ' '/"limit"/ {print $2}' | tr -d ',' | tr -d ' ') - current_value=$(echo "${current_value:-0}" | cut -d'.' -f1) - limit=$(echo "${limit:-0}" | cut -d'.' -f1) + limit=$(echo "$model_info" | cut -f1) + current_value=$(echo "$model_info" | cut -f2) + current_value=$(echo "${current_value:-0}" | cut -d'.' -f1 | tr -dc '0-9') + limit=$(echo "${limit:-0}" | cut -d'.' -f1 | tr -dc '0-9') + current_value=${current_value:-0} + limit=${limit:-0} local available=$((limit - current_value)) write_key_value "Model" "$model_type" @@ -398,7 +529,36 @@ resolve_model_quota() { # ── Interactive VM Selection Menu ───────────────────────────────────────── # NOTE: This function uses low-level terminal manipulation for interactive -# arrow-key menus. Its internal styling is intentionally left as-is. +# arrow-key menus when a capable terminal is available. When running under +# azd on Windows (pseudo-terminal without raw-mode support) it falls back to +# a simple numbered-list menu that only needs basic line input. + +# Detect whether the terminal supports the interactive arrow-key menu. 
+# On Windows (MSYS / Git Bash / Cygwin) the pseudo-terminal that azd provides +# passes the stty probe but still breaks on read -rsn1, so we always fall back +# to the simple numbered-list menu there. +_TERM_SUPPORTS_RAW="" +_test_terminal_raw_mode() { + if [ -n "$_TERM_SUPPORTS_RAW" ]; then return "$_TERM_SUPPORTS_RAW"; fi + _TERM_SUPPORTS_RAW=1 # assume unsupported + + # Windows pseudo-terminals (MSYS, Cygwin) don't reliably support raw reads + case "$OSTYPE" in + msys*|cygwin*|mingw*) _TERM_SUPPORTS_RAW=1; return 1 ;; + esac + + if [ -t 0 ] && [ -t 1 ]; then + local _old + _old=$(stty -g 2>/dev/null) || { _TERM_SUPPORTS_RAW=1; return 1; } + if stty raw -echo min 0 time 1 2>/dev/null; then + stty "$_old" 2>/dev/null + _TERM_SUPPORTS_RAW=0 + else + stty "$_old" 2>/dev/null || true + fi + fi + return "$_TERM_SUPPORTS_RAW" +} SELECTED_VM_SKU="" SELECTED_VM_CORES=0 @@ -407,16 +567,14 @@ SELECTED_VM_FAMILY="" show_vm_selection_menu() { local pool_name="$1" local env_var_name="$2" - local -n _vm_array=$3 + local _vm_array_name=$3 local default_sku="$4" local location="$5" local max_nodes="${6:-1}" local is_gpu="${7:-}" - local vm_count=${#_vm_array[@]} - local item_count=$((vm_count + 1)) - local max_visible=20 - [ "$item_count" -lt "$max_visible" ] && max_visible=$item_count + local -n _vm_array_ref=$_vm_array_name + local vm_count=${#_vm_array_ref[@]} if [ "$vm_count" -eq 0 ]; then log_error "No VM sizes available for this pool." 
@@ -424,13 +582,172 @@ show_vm_selection_menu() { exit 1 fi - # Parse into parallel arrays for fast indexed access - local -a vm_names vm_cores vm_mem - for entry in "${_vm_array[@]}"; do - vm_names+=("${entry%%|*}") - local rest="${entry#*|}" - vm_cores+=("${rest%%|*}") - vm_mem+=("${rest#*|}") + if _test_terminal_raw_mode; then + _show_vm_menu_interactive "$pool_name" "$env_var_name" "$_vm_array_name" "$default_sku" "$location" "$max_nodes" "$is_gpu" + else + _show_vm_menu_simple "$pool_name" "$env_var_name" "$_vm_array_name" "$default_sku" "$location" "$max_nodes" "$is_gpu" + fi +} + +# ── Simple numbered-list fallback (works without raw terminal) ──────────── +_show_vm_menu_simple() { + local pool_name="$1" + local env_var_name="$2" + local -n _svm_array=$3 + local default_sku="$4" + local location="$5" + local max_nodes="${6:-1}" + local is_gpu="${7:-}" + + local vm_count=${#_svm_array[@]} + + # Parse into parallel arrays + local -a vm_names vm_cores vm_mem vm_family vm_avail vm_limit vm_has_enough + local has_quota_info=0 + for entry in "${_svm_array[@]}"; do + IFS='|' read -r -a _f <<< "$entry" + vm_names+=("${_f[0]}") + vm_cores+=("${_f[1]}") + vm_mem+=("${_f[2]}") + vm_family+=("${_f[3]:-}") + vm_avail+=("${_f[4]:-}") + vm_limit+=("${_f[5]:-}") + vm_has_enough+=("${_f[6]:-1}") + if [ -n "${_f[4]:-}" ] || [ -n "${_f[5]:-}" ]; then + has_quota_info=1 + fi + done + + # Find default index + local default_index=0 + for ((i=0; i/dev/null || \ + log_warning "Could not persist ${env_var_name} via 'azd env set'." 
+ + log_success "Selected: $selected_sku" + echo "" + SELECTED_VM_SKU="$selected_sku" + SELECTED_VM_CORES="$selected_cores" + SELECTED_VM_FAMILY="$selected_family" + return 0 + done +} + +# ── Interactive arrow-key menu (requires raw terminal support) ──────────── +_show_vm_menu_interactive() { + local pool_name="$1" + local env_var_name="$2" + local -n _ivm_array=$3 + local default_sku="$4" + local location="$5" + local max_nodes="${6:-1}" + local is_gpu="${7:-}" + + local vm_count=${#_ivm_array[@]} + local item_count=$((vm_count + 1)) + local max_visible=20 + [ "$item_count" -lt "$max_visible" ] && max_visible=$item_count + + # Parse into parallel arrays for fast indexed access. + # Entries are either "name|cores|memGB" (legacy), "name|cores|memGB|family", + # or quota-annotated "name|cores|memGB|family|avail|limit|hasEnough". + local -a vm_names vm_cores vm_mem vm_family vm_avail vm_limit vm_has_enough + local has_quota_info=0 + for entry in "${_ivm_array[@]}"; do + IFS='|' read -r -a _f <<< "$entry" + vm_names+=("${_f[0]}") + vm_cores+=("${_f[1]}") + vm_mem+=("${_f[2]}") + vm_family+=("${_f[3]:-}") + vm_avail+=("${_f[4]:-}") + vm_limit+=("${_f[5]:-}") + vm_has_enough+=("${_f[6]:-1}") + if [ -n "${_f[4]:-}" ] || [ -n "${_f[5]:-}" ]; then + has_quota_info=1 + fi done # Find default index, set initial scroll @@ -465,7 +782,18 @@ show_vm_selection_menu() { [ "${vm_names[$idx]}" = "$default_sku" ] && is_def="1" local text tag="" [ "$is_def" = "1" ] && tag=" (default)" - text=$(printf "%-35s %-10s %-8s%s" "${vm_names[$idx]}" "${vm_cores[$idx]} vCPUs" "${vm_mem[$idx]} GB" "$tag") + + # Quota column + local qcol="" + if [ "$has_quota_info" = "1" ]; then + if [ -n "${vm_avail[$idx]}" ]; then + qcol=$(printf "%-14s" "${vm_avail[$idx]} free") + else + qcol=$(printf "%-14s" "quota n/a") + fi + fi + + text=$(printf "%-35s %-10s %-8s %s%s" "${vm_names[$idx]}" "${vm_cores[$idx]} vCPUs" "${vm_mem[$idx]} GB" "$qcol" "$tag") if [ "$idx" -eq "$selected" ]; then printf "\033[K 
\033[30;46m > %s \033[0m\n" "$text" elif [ "$is_def" = "1" ]; then @@ -483,23 +811,32 @@ show_vm_selection_menu() { echo "" write_section "Select VM size for ${pool_name} (${vm_count} sizes available)" log_info "Use ↑/↓ to move, Enter to select, C custom, Esc cancel" + if [ "$has_quota_info" = "1" ]; then + log_info "Quota column shows cores free in this region (pool max nodes: ${max_nodes})." + fi echo "" - # Get cursor row via ANSI DSR - local cursor_row - if [ -t 0 ]; then - local old_stty; old_stty=$(stty -g) - stty raw -echo min 0 - printf "\033[6n" > /dev/tty - local response="" - while true; do - local ch; ch=$(dd bs=1 count=1 2>/dev/null) - response="${response}${ch}" - case "$response" in *R) break ;; esac - done - stty "$old_stty" - cursor_row=$(echo "$response" | sed 's/.*\[//;s/;.*//') - else + # Get cursor row via ANSI DSR. + # The stty/dd sequence can fail on Windows pseudo-terminals or when azd + # pipes stdin, so guard against set -e by using a subshell with || true. + local cursor_row="" + if [ -t 0 ] && [ -t 1 ]; then + cursor_row=$( + old_stty=$(stty -g 2>/dev/null) || true + stty raw -echo min 0 2>/dev/null || true + printf "\033[6n" > /dev/tty 2>/dev/null || true + resp="" + for _i in $(seq 1 20); do + ch=$(dd bs=1 count=1 2>/dev/null) || break + resp="${resp}${ch}" + case "$resp" in *R) break ;; esac + done + [ -n "$old_stty" ] && stty "$old_stty" 2>/dev/null || true + echo "$resp" | sed 's/.*\[//;s/;.*//' + ) 2>/dev/null || true + fi + # Fall back if DSR failed or returned garbage + if ! [[ "$cursor_row" =~ ^[0-9]+$ ]]; then cursor_row=10 fi @@ -557,15 +894,23 @@ show_vm_selection_menu() { # GPU quota check if [ "$is_gpu" = "gpu" ]; then _write_log_message "Resolving quota family..." 
"$SYM_INFO" "$C_ACCENT" "$C_TEXT" false true - get_quota_family_for_vm "$selected_sku" "$location" + local sku_family="" + for ((i=0; i$null | ConvertFrom-Json -ErrorAction SilentlyContinue + $installed = Invoke-AzJson { az extension show --name $ext } if (-not $installed) { Log-Info "Installing Azure CLI extension: $ext..." az extension add --name $ext --yes 2>$null @@ -49,9 +51,9 @@ foreach ($ext in @('connectedk8s', 'k8s-extension')) { # ===================================================== # Authenticate and set subscription # ===================================================== -$EXPIRED_TOKEN = (az ad signed-in-user show --query 'id' -o tsv 2>$null) +$SIGNED_IN_USER_ID = (az ad signed-in-user show --query 'id' -o tsv 2>$null) -if (-not $EXPIRED_TOKEN) { +if (-not $SIGNED_IN_USER_ID) { Log-Warning "No Azure user signed in. Please login." az login -o none } @@ -96,7 +98,10 @@ Log-Success "NVIDIA GPU Operator installed" # Step 3: Connect AKS to Azure Arc # ===================================================== $ARC_CLUSTER_NAME = "${ARC_CLUSTER_PREFIX}$env:AZURE_AKS_CLUSTER_NAME" -# Defensive clamp: Azure Arc cluster names must be <= 63 chars +# Azure Arc cluster name rules: lowercase alnum + hyphen, no leading/trailing hyphen, <=63 chars +$ARC_CLUSTER_NAME = $ARC_CLUSTER_NAME.ToLower() -replace '[^a-z0-9-]', '-' +$ARC_CLUSTER_NAME = $ARC_CLUSTER_NAME -replace '-+', '-' +$ARC_CLUSTER_NAME = $ARC_CLUSTER_NAME.Trim('-') if ($ARC_CLUSTER_NAME.Length -gt 63) { $ARC_CLUSTER_NAME = $ARC_CLUSTER_NAME.Substring(0, 63).TrimEnd('-') } @@ -129,7 +134,7 @@ else { } # Save the Arc cluster name to azd env -azd env set AZURE_ARC_CLUSTER_NAME "$ARC_CLUSTER_NAME" +[void](Invoke-AzdEnvSet -Name 'AZURE_ARC_CLUSTER_NAME' -Value "$ARC_CLUSTER_NAME") # ===================================================== # Step 4: Create Public IP and construct Endpoint URI @@ -147,7 +152,16 @@ if ($env:AZURE_DNS_LABEL) { Log-Info "Reusing existing DNS label: $DNS_LABEL" } else { - $RANDOM_SUFFIX = 
Get-Random -Minimum 100 -Maximum 1000 + # Deterministic 5-digit suffix derived from env + location + subscription. + # Same inputs always produce same DNS label (idempotent) while collision + # space is 10^5 (vs 900 for the prior Get-Random 100..999). + $dnsHashInput = "$($env:AZURE_ENV_NAME)|$($env:AZURE_LOCATION)|$($env:AZURE_SUBSCRIPTION_ID)" + $sha = [System.Security.Cryptography.SHA256]::Create() + $bytes = $sha.ComputeHash([System.Text.Encoding]::UTF8.GetBytes($dnsHashInput)) + $sha.Dispose() + # Take first 4 bytes as uint32, mod 100000 for a 5-digit decimal suffix + $uint = [System.BitConverter]::ToUInt32($bytes, 0) + $RANDOM_SUFFIX = ('{0:D5}' -f ($uint % 100000)) $DNS_LABEL = "$($env:AZURE_ENV_NAME)$RANDOM_SUFFIX" Log-Info "Generated DNS label: $DNS_LABEL" } @@ -192,9 +206,9 @@ $VIDEO_INDEXER_ENDPOINT_URI = "https://${DNS_LABEL}.$($env:AZURE_LOCATION).cloud Write-KeyValue "Endpoint URI" $VIDEO_INDEXER_ENDPOINT_URI # Persist to azd env -azd env set AZURE_DNS_LABEL "$DNS_LABEL" -azd env set AZURE_STATIC_IP "$STATIC_IP" -azd env set AZURE_VIDEO_INDEXER_ENDPOINT_URI "$VIDEO_INDEXER_ENDPOINT_URI" +[void](Invoke-AzdEnvSet -Name 'AZURE_DNS_LABEL' -Value "$DNS_LABEL") +[void](Invoke-AzdEnvSet -Name 'AZURE_STATIC_IP' -Value "$STATIC_IP") +[void](Invoke-AzdEnvSet -Name 'AZURE_VIDEO_INDEXER_ENDPOINT_URI' -Value "$VIDEO_INDEXER_ENDPOINT_URI") # ===================================================== # Step 5: Enable App Routing (HTTP only) @@ -294,7 +308,9 @@ Log-Success "Cert Manager extension deployed" # ===================================================== Log-Step -Number 9 -Total $totalSteps -Title "Deploying Video Indexer Arc Extension" -$inferenceAgentEnabled = if ($env:CREATE_FOUNDRY_PROJECT -eq 'true') { 'false' } else { 'true' } +# Normalize to true by default; only explicit "false" disables Foundry. 
+$createFoundry = if ($env:CREATE_FOUNDRY_PROJECT -eq 'false') { 'false' } else { 'true' } +$inferenceAgentEnabled = if ($createFoundry -eq 'false') { 'true' } else { 'false' } $mediaStreamerEnabled = if ($env:MEDIA_STREAMER_ENABLED -eq 'false') { 'false' } else { 'true' } Log-Info "Deploying VI Arc extension (this may take several minutes)..." @@ -308,8 +324,8 @@ az deployment group create ` videoIndexerEndpointUri="$VIDEO_INDEXER_ENDPOINT_URI" ` deepstreamNodeSelectorValue="$env:AZURE_DEEPSTREAM_NODE_SELECTOR_VALUE" ` inferenceNodeSelectorValue="$env:AZURE_INFERENCE_NODE_SELECTOR_VALUE" ` - inferenceAgentEnabled=$inferenceAgentEnabled ` - mediaStreamerEnabled=$mediaStreamerEnabled + inferenceAgentEnabled="$inferenceAgentEnabled" ` + mediaStreamerEnabled="$mediaStreamerEnabled" Log-Success "Video Indexer Arc extension deployed" diff --git a/hooks/postprovision.sh b/hooks/postprovision.sh index 2f01ab6..e7c0be8 100755 --- a/hooks/postprovision.sh +++ b/hooks/postprovision.sh @@ -4,8 +4,7 @@ # Post-Provision Script: Connect AKS to Azure Arc and deploy VI extension # ============================================================================= -set -e - +set -eo pipefail source "$(dirname "$0")/common.sh" TOTAL_STEPS=12 @@ -13,6 +12,8 @@ TOTAL_STEPS=12 write_foundry_banner "Post-Provision Setup" if [ "$CREATE_IN_LOCAL" = "false" ]; then + # CREATE_IN_LOCAL=false is set by CI workflows that provision infra via + # dedicated pipelines and do not want the local azd hook to run. log_info "Skipping postprovision script for non-local deployment." 
exit 0 fi @@ -43,9 +44,9 @@ done # ===================================================== # Authenticate and set subscription # ===================================================== -EXPIRED_TOKEN=$(az ad signed-in-user show --query 'id' -o tsv 2>/dev/null || true) +SIGNED_IN_USER_ID=$(az ad signed-in-user show --query 'id' -o tsv 2>/dev/null | tr -d '\r' || true) -if [ -z "$EXPIRED_TOKEN" ]; then +if [ -z "$SIGNED_IN_USER_ID" ]; then log_warning "No Azure user signed in. Please login." az login -o none fi @@ -90,9 +91,12 @@ log_success "NVIDIA GPU Operator installed" # Step 3: Connect AKS to Azure Arc # ===================================================== ARC_CLUSTER_NAME="${ARC_CLUSTER_PREFIX}${AZURE_AKS_CLUSTER_NAME}" -# Defensive clamp: Azure Arc cluster names must be <= 63 chars +# Azure Arc cluster name rules: lowercase alnum + hyphen, no leading/trailing hyphen, <=63 chars +ARC_CLUSTER_NAME=$(echo "$ARC_CLUSTER_NAME" | tr '[:upper:]' '[:lower:]' | tr -c 'a-z0-9-' '-') +ARC_CLUSTER_NAME=$(echo "$ARC_CLUSTER_NAME" | sed -E 's/-+/-/g; s/^-+//; s/-+$//') if [ ${#ARC_CLUSTER_NAME} -gt 63 ]; then - ARC_CLUSTER_NAME=$(echo "${ARC_CLUSTER_NAME:0:63}" | sed 's/-$//') + ARC_CLUSTER_NAME="${ARC_CLUSTER_NAME:0:63}" + ARC_CLUSTER_NAME=$(echo "$ARC_CLUSTER_NAME" | sed -E 's/-+$//') fi log_step 3 $TOTAL_STEPS "Connecting AKS to Azure Arc" @@ -102,7 +106,7 @@ write_key_value "Arc cluster name" "$ARC_CLUSTER_NAME" ARC_EXISTS=$(az connectedk8s show \ --name "$ARC_CLUSTER_NAME" \ --resource-group "$AZURE_RESOURCE_GROUP" \ - --query "name" -o tsv 2>/dev/null || true) + --query "name" -o tsv 2>/dev/null | tr -d '\r' || true) if [ -n "$ARC_EXISTS" ]; then log_success "Arc-connected cluster already exists. Skipping." 
@@ -116,7 +120,7 @@ else fi # Save the Arc cluster name to azd env -azd env set AZURE_ARC_CLUSTER_NAME "$ARC_CLUSTER_NAME" +azd_env_set AZURE_ARC_CLUSTER_NAME "$ARC_CLUSTER_NAME" || true # ===================================================== # Step 4: Create Public IP and construct Endpoint URI @@ -133,7 +137,18 @@ if [ -n "$AZURE_DNS_LABEL" ]; then DNS_LABEL="$AZURE_DNS_LABEL" log_info "Reusing existing DNS label: $DNS_LABEL" else - RANDOM_SUFFIX=$((RANDOM % 900 + 100)) + # Deterministic 5-digit suffix derived from env + location + subscription. + # Same inputs always produce same DNS label (idempotent) while collision + # space is 10^5 (vs 900 for the prior RANDOM % 900 + 100). + DNS_HASH_INPUT="${AZURE_ENV_NAME}|${AZURE_LOCATION}|${AZURE_SUBSCRIPTION_ID:-}" + if command -v sha256sum >/dev/null 2>&1; then + DNS_HASH=$(printf '%s' "$DNS_HASH_INPUT" | sha256sum | cut -c1-10) + else + DNS_HASH=$(printf '%s' "$DNS_HASH_INPUT" | shasum -a 256 | cut -c1-10) + fi + # Convert hex slice to decimal and take 5 digits + RANDOM_SUFFIX=$(printf '%d' "0x${DNS_HASH}" 2>/dev/null | tr -dc '0-9' | head -c 5) + RANDOM_SUFFIX=${RANDOM_SUFFIX:-00000} DNS_LABEL="${AZURE_ENV_NAME}${RANDOM_SUFFIX}" log_info "Generated DNS label: $DNS_LABEL" fi @@ -144,7 +159,7 @@ PUBLIC_IP_NAME="${AZURE_ENV_NAME}-inbound-ip" PUBLIC_IP_EXISTS=$(az network public-ip show \ --resource-group "$AKS_MC_RG" \ --name "$PUBLIC_IP_NAME" \ - --query "name" -o tsv 2>/dev/null || true) + --query "name" -o tsv 2>/dev/null | tr -d '\r' || true) if [ -n "$PUBLIC_IP_EXISTS" ]; then log_success "Public IP '$PUBLIC_IP_NAME' already exists. Skipping." 
@@ -163,7 +178,7 @@ fi STATIC_IP=$(az network public-ip show \ --resource-group "$AKS_MC_RG" \ --name "$PUBLIC_IP_NAME" \ - --query "ipAddress" -o tsv) + --query "ipAddress" -o tsv | tr -d '\r') write_key_value "Static IP" "$STATIC_IP" @@ -171,9 +186,9 @@ VIDEO_INDEXER_ENDPOINT_URI="https://${DNS_LABEL}.${AZURE_LOCATION}.cloudapp.azur write_key_value "Endpoint URI" "$VIDEO_INDEXER_ENDPOINT_URI" # Persist to azd env -azd env set AZURE_DNS_LABEL "${DNS_LABEL}" -azd env set AZURE_STATIC_IP "${STATIC_IP}" -azd env set AZURE_VIDEO_INDEXER_ENDPOINT_URI "${VIDEO_INDEXER_ENDPOINT_URI}" +azd_env_set AZURE_DNS_LABEL "${DNS_LABEL}" || true +azd_env_set AZURE_STATIC_IP "${STATIC_IP}" || true +azd_env_set AZURE_VIDEO_INDEXER_ENDPOINT_URI "${VIDEO_INDEXER_ENDPOINT_URI}" || true # ===================================================== # Step 5: Enable App Routing (HTTP only) @@ -183,7 +198,7 @@ log_step 5 $TOTAL_STEPS "Enabling App Routing on AKS Cluster" APPROUTING_ENABLED=$(az aks show \ --resource-group "$AZURE_RESOURCE_GROUP" \ --name "$AZURE_AKS_CLUSTER_NAME" \ - --query "ingressProfile.webAppRouting.enabled" -o tsv 2>/dev/null || true) + --query "ingressProfile.webAppRouting.enabled" -o tsv 2>/dev/null | tr -d '\r' || true) if [ "$APPROUTING_ENABLED" = "true" ]; then log_success "App Routing already enabled. Skipping." @@ -258,10 +273,12 @@ log_success "Cert Manager extension deployed" # ===================================================== log_step 9 $TOTAL_STEPS "Deploying Video Indexer Arc Extension" -if [ "$CREATE_FOUNDRY_PROJECT" = "true" ]; then - INFERENCE_AGENT_ENABLED="false" -else +# Normalize to true by default; only explicit "false" disables Foundry. 
+CREATE_FOUNDRY_PROJECT="${CREATE_FOUNDRY_PROJECT:-true}" +if [ "$CREATE_FOUNDRY_PROJECT" = "false" ]; then INFERENCE_AGENT_ENABLED="true" +else + INFERENCE_AGENT_ENABLED="false" fi MEDIA_STREAMER_ENABLED="${MEDIA_STREAMER_ENABLED:-true}" @@ -277,8 +294,8 @@ az deployment group create \ videoIndexerEndpointUri="$VIDEO_INDEXER_ENDPOINT_URI" \ deepstreamNodeSelectorValue="$AZURE_DEEPSTREAM_NODE_SELECTOR_VALUE" \ inferenceNodeSelectorValue="$AZURE_INFERENCE_NODE_SELECTOR_VALUE" \ - inferenceAgentEnabled=$INFERENCE_AGENT_ENABLED \ - mediaStreamerEnabled=$MEDIA_STREAMER_ENABLED + inferenceAgentEnabled="$INFERENCE_AGENT_ENABLED" \ + mediaStreamerEnabled="$MEDIA_STREAMER_ENABLED" log_success "Video Indexer Arc extension deployed" log_info "Assigning permissions to Arc extension managed identity..." @@ -287,7 +304,7 @@ PRINCIPAL_ID=$(az k8s-extension show \ --cluster-name "$ARC_CLUSTER_NAME" \ --cluster-type connectedClusters \ --name videoindexer \ - --query "identity.principalId" -o tsv 2>/dev/null || true) + --query "identity.principalId" -o tsv 2>/dev/null | tr -d '\r' || true) ACCOUNT_RESOURCE_ID="$AZURE_VIDEO_INDEXER_ACCOUNT_RESOURCE_ID" @@ -305,7 +322,7 @@ else --assignee "$PRINCIPAL_ID" \ --role Contributor \ --scope "$ACCOUNT_RESOURCE_ID" \ - --query "[0].id" -o tsv 2>/dev/null || true) + --query "[0].id" -o tsv 2>/dev/null | tr -d '\r' || true) if [ -n "$EXISTING_ASSIGNMENT" ]; then log_success "Role assignment already exists. Skipping." 
@@ -332,7 +349,7 @@ VI_EXT_STATE=$(az k8s-extension show \ --cluster-name "$ARC_CLUSTER_NAME" \ --cluster-type connectedClusters \ --name videoindexer \ - --query "provisioningState" -o tsv 2>/dev/null || echo "Unknown") + --query "provisioningState" -o tsv 2>/dev/null | tr -d '\r' || echo "Unknown") # ===================================================== # Acquire VI Extension Access Token (used by Steps 10-11) @@ -351,7 +368,7 @@ else --cluster-name "$ARC_CLUSTER_NAME" \ --cluster-type connectedClusters \ --name videoindexer \ - --query "id" -o tsv 2>/dev/null || true) + --query "id" -o tsv 2>/dev/null | tr -d '\r' || true) if [ -z "$EXTENSION_ID" ]; then log_warning "Failed to retrieve VI extension ID. Skipping camera and agent job setup." @@ -362,7 +379,7 @@ else VI_ACCESS_TOKEN=$(az rest --method post --url "$TOKEN_URL" \ --body "$TOKEN_BODY" \ - --query "accessToken" -o tsv 2>/dev/null || true) + --query "accessToken" -o tsv 2>/dev/null | tr -d '\r' || true) if [ -z "$VI_ACCESS_TOKEN" ]; then log_warning "Failed to generate VI extension access token. Skipping camera and agent job setup." 
@@ -459,7 +476,7 @@ log_step 12 $TOTAL_STEPS "Running Post-Deployment Health Checks" ARC_STATUS=$(az connectedk8s show \ --name "$ARC_CLUSTER_NAME" \ --resource-group "$AZURE_RESOURCE_GROUP" \ - --query "connectivityStatus" -o tsv 2>/dev/null || echo "unknown") + --query "connectivityStatus" -o tsv 2>/dev/null | tr -d '\r' || echo "unknown") ARC_HEALTH="Pass"; [ "$ARC_STATUS" != "Connected" ] && ARC_HEALTH="Warn" write_health_row "Arc connection" "$ARC_HEALTH" "$ARC_STATUS" diff --git a/hooks/postup.ps1 b/hooks/postup.ps1 index d8f9335..e99c241 100644 --- a/hooks/postup.ps1 +++ b/hooks/postup.ps1 @@ -64,7 +64,7 @@ if ($HasClusterAccess) { $HealthResults += @{ Name = "Cluster nodes"; Status = $nodeStatus; Detail = $nodeDetail } try { - $gpuNodes = (kubectl --context $KubeContext get nodes -l "accelerator=nvidia" --no-headers 2>$null | Measure-Object -Line).Lines + $gpuNodes = (kubectl --context $KubeContext get nodes -l "nvidia.com/gpu.present=true" --no-headers 2>$null | Measure-Object -Line).Lines if ($gpuNodes -gt 0) { $gpuNodeStatus = "Pass" $gpuNodeDetail = "$gpuNodes detected" diff --git a/hooks/postup.sh b/hooks/postup.sh index ed9e758..310295e 100755 --- a/hooks/postup.sh +++ b/hooks/postup.sh @@ -4,8 +4,7 @@ # Post-Up Script: Deployment health dashboard and next steps # ============================================================================= -set -e - +set -eo pipefail source "$(dirname "$0")/common.sh" TOTAL_STEPS=6 @@ -51,18 +50,21 @@ log_step 1 $TOTAL_STEPS "AKS Cluster Health" if [ "$HAS_CLUSTER_ACCESS" = "true" ]; then AKS_STATE=$(az aks show -g "$AZURE_RESOURCE_GROUP" -n "$AZURE_AKS_CLUSTER_NAME" \ - --query "provisioningState" -o tsv 2>/dev/null || echo "Unknown") + --query "provisioningState" -o tsv 2>/dev/null | tr -d '\r' || echo "Unknown") AKS_STATUS="Pass"; [ "$AKS_STATE" != "Succeeded" ] && AKS_STATUS="Fail" write_health_row "AKS provisioning state" "$AKS_STATUS" "$AKS_STATE" _track_health "$AKS_STATUS" TOTAL_NODES=$(kubectl --context 
"$KUBE_CONTEXT" get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ') - READY_NODES=$(kubectl --context "$KUBE_CONTEXT" get nodes --no-headers 2>/dev/null | grep -c " Ready " || echo "0") + READY_NODES=$(kubectl --context "$KUBE_CONTEXT" get nodes --no-headers 2>/dev/null | grep -c " Ready " ; true) + READY_NODES=$(echo "$READY_NODES" | tr -dc '0-9' | head -c 6) + READY_NODES=${READY_NODES:-0} NODE_STATUS="Pass"; [ "$READY_NODES" != "$TOTAL_NODES" ] && NODE_STATUS="Warn" write_health_row "Cluster nodes" "$NODE_STATUS" "${READY_NODES}/${TOTAL_NODES} Ready" _track_health "$NODE_STATUS" - GPU_NODES=$(kubectl --context "$KUBE_CONTEXT" get nodes -l "accelerator=nvidia" --no-headers 2>/dev/null | wc -l | tr -d ' ' || echo "0") + GPU_NODES=$(kubectl --context "$KUBE_CONTEXT" get nodes -l "nvidia.com/gpu.present=true" --no-headers 2>/dev/null | wc -l | tr -d ' ') + GPU_NODES=${GPU_NODES:-0} GPU_NODE_STATUS="Pass"; [ "$GPU_NODES" -eq 0 ] && GPU_NODE_STATUS="Warn" GPU_NODE_DETAIL="$GPU_NODES detected" [ "$GPU_NODES" -eq 0 ] && GPU_NODE_DETAIL="none detected (may still be provisioning)" @@ -107,7 +109,7 @@ if [ -n "$ARC_CLUSTER_NAME" ]; then ARC_STATUS=$(az connectedk8s show \ --name "$ARC_CLUSTER_NAME" \ --resource-group "$AZURE_RESOURCE_GROUP" \ - --query "connectivityStatus" -o tsv 2>/dev/null || echo "Unknown") + --query "connectivityStatus" -o tsv 2>/dev/null | tr -d '\r' || echo "Unknown") ARC_HEALTH="Pass"; [ "$ARC_STATUS" != "Connected" ] && ARC_HEALTH="Warn" write_health_row "Arc connection" "$ARC_HEALTH" "$ARC_CLUSTER_NAME ($ARC_STATUS)" _track_health "$ARC_HEALTH" @@ -144,8 +146,15 @@ if [ "$HAS_CLUSTER_ACCESS" = "true" ]; then if [ -n "${AZURE_DNS_LABEL:-}" ] && [ -n "${AZURE_LOCATION:-}" ]; then FQDN="${AZURE_DNS_LABEL}.${AZURE_LOCATION}.cloudapp.azure.com" - DNS_RESOLVED=$(host "$FQDN" 2>/dev/null | grep -c "has address" || \ - nslookup "$FQDN" 2>/dev/null | grep -c "Address:" || echo "0") + if command -v host >/dev/null 2>&1; then + DNS_RESOLVED=$(host 
"$FQDN" 2>/dev/null | grep -c "has address" ; true) + elif command -v nslookup >/dev/null 2>&1; then + DNS_RESOLVED=$(nslookup "$FQDN" 2>/dev/null | grep -c "^Address:" ; true) + else + DNS_RESOLVED=0 + fi + DNS_RESOLVED=$(echo "${DNS_RESOLVED:-0}" | tr -dc '0-9' | head -c 6) + DNS_RESOLVED=${DNS_RESOLVED:-0} if [ "$DNS_RESOLVED" -gt 0 ]; then write_health_row "DNS resolution" "Pass" "$FQDN" _track_health "Pass" @@ -169,7 +178,7 @@ if [ -n "$ARC_CLUSTER_NAME" ]; then --cluster-name "$ARC_CLUSTER_NAME" \ --cluster-type connectedClusters \ --name videoindexer \ - --query "provisioningState" -o tsv 2>/dev/null || echo "Unknown") + --query "provisioningState" -o tsv 2>/dev/null | tr -d '\r' || echo "Unknown") VI_EXT_STATUS="Pass"; [ "$VI_STATE" != "Succeeded" ] && VI_EXT_STATUS="Warn" write_health_row "VI Extension" "$VI_EXT_STATUS" "$VI_STATE" _track_health "$VI_EXT_STATUS" diff --git a/hooks/predown.ps1 b/hooks/predown.ps1 deleted file mode 100644 index 40e6400..0000000 --- a/hooks/predown.ps1 +++ /dev/null @@ -1,77 +0,0 @@ -# ============================================================================= -# Pre-Down Cleanup: Remove resources created outside azd's tracked scope -# ============================================================================= -# The postprovision hook creates a Public IP in the AKS node resource group -# (MC_ group), which azd down does not track. This hook cleans it up before -# the resource group is deleted. -# ============================================================================= - -$ErrorActionPreference = "Stop" - -Write-Host "" -Write-Host " Pre-down cleanup: removing untracked resources..." 
-ForegroundColor Cyan -Write-Host "" - -# ── Public IP in the AKS node resource group ──────────────────────────────── -$AKS_MC_RG = $env:AZURE_AKS_NODE_RESOURCE_GROUP -$PUBLIC_IP_NAME = "$($env:AZURE_ENV_NAME)-inbound-ip" - -if (-not $AKS_MC_RG -or -not $env:AZURE_ENV_NAME) { - Write-Host " [SKIP] Missing AZURE_AKS_NODE_RESOURCE_GROUP or AZURE_ENV_NAME. Nothing to clean up." -ForegroundColor Yellow - exit 0 -} - -$exists = $null -try { - $exists = (az network public-ip show ` - --resource-group "$AKS_MC_RG" ` - --name "$PUBLIC_IP_NAME" ` - --query "name" -o tsv 2>$null) -} -catch { - $exists = $null -} - -if ($exists) { - Write-Host " Deleting public IP '$PUBLIC_IP_NAME' from '$AKS_MC_RG'..." - az network public-ip delete ` - --resource-group "$AKS_MC_RG" ` - --name "$PUBLIC_IP_NAME" 2>$null - Write-Host " [OK] Public IP deleted." -ForegroundColor Green -} -else { - Write-Host " [SKIP] Public IP '$PUBLIC_IP_NAME' not found in '$AKS_MC_RG'. Nothing to clean up." -ForegroundColor Yellow -} - -# ── Arc-connected cluster ──────────────────────────────────────────────────── -$ARC_CLUSTER_NAME = $env:AZURE_ARC_CLUSTER_NAME -$RESOURCE_GROUP = $env:AZURE_RESOURCE_GROUP - -if ($ARC_CLUSTER_NAME -and $RESOURCE_GROUP) { - $arcExists = $null - try { - $arcExists = (az connectedk8s show ` - --name "$ARC_CLUSTER_NAME" ` - --resource-group "$RESOURCE_GROUP" ` - --query "name" -o tsv 2>$null) - } - catch { - $arcExists = $null - } - - if ($arcExists) { - Write-Host " Disconnecting Arc cluster '$ARC_CLUSTER_NAME'..." - az connectedk8s delete ` - --name "$ARC_CLUSTER_NAME" ` - --resource-group "$RESOURCE_GROUP" ` - --yes 2>$null - Write-Host " [OK] Arc cluster disconnected." -ForegroundColor Green - } - else { - Write-Host " [SKIP] Arc cluster '$ARC_CLUSTER_NAME' not found. Nothing to clean up." -ForegroundColor Yellow - } -} - -Write-Host "" -Write-Host " Pre-down cleanup complete." 
-ForegroundColor Green -Write-Host "" diff --git a/hooks/predown.sh b/hooks/predown.sh deleted file mode 100755 index df3e4da..0000000 --- a/hooks/predown.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -# ============================================================================= -# Pre-Down Cleanup: Remove resources created outside azd's tracked scope -# ============================================================================= -# The postprovision hook creates a Public IP in the AKS node resource group -# (MC_ group), which azd down does not track. This hook cleans it up before -# the resource group is deleted. -# ============================================================================= - -set -e - -echo "" -echo " Pre-down cleanup: removing untracked resources..." -echo "" - -# ── Public IP in the AKS node resource group ──────────────────────────────── -AKS_MC_RG="${AZURE_AKS_NODE_RESOURCE_GROUP}" -PUBLIC_IP_NAME="${AZURE_ENV_NAME}-inbound-ip" - -if [ -z "$AKS_MC_RG" ] || [ -z "$AZURE_ENV_NAME" ]; then - echo " [SKIP] Missing AZURE_AKS_NODE_RESOURCE_GROUP or AZURE_ENV_NAME. Nothing to clean up." - exit 0 -fi - -IP_EXISTS=$(az network public-ip show \ - --resource-group "$AKS_MC_RG" \ - --name "$PUBLIC_IP_NAME" \ - --query "name" -o tsv 2>/dev/null || true) - -if [ -n "$IP_EXISTS" ]; then - echo " Deleting public IP '$PUBLIC_IP_NAME' from '$AKS_MC_RG'..." - az network public-ip delete \ - --resource-group "$AKS_MC_RG" \ - --name "$PUBLIC_IP_NAME" 2>/dev/null - echo " [OK] Public IP deleted." -else - echo " [SKIP] Public IP '$PUBLIC_IP_NAME' not found in '$AKS_MC_RG'. Nothing to clean up." 
-fi - -# ── Arc-connected cluster ──────────────────────────────────────────────────── -ARC_CLUSTER_NAME="${AZURE_ARC_CLUSTER_NAME}" -RESOURCE_GROUP="${AZURE_RESOURCE_GROUP}" - -if [ -n "$ARC_CLUSTER_NAME" ] && [ -n "$RESOURCE_GROUP" ]; then - ARC_EXISTS=$(az connectedk8s show \ - --name "$ARC_CLUSTER_NAME" \ - --resource-group "$RESOURCE_GROUP" \ - --query "name" -o tsv 2>/dev/null || true) - - if [ -n "$ARC_EXISTS" ]; then - echo " Disconnecting Arc cluster '$ARC_CLUSTER_NAME'..." - az connectedk8s delete \ - --name "$ARC_CLUSTER_NAME" \ - --resource-group "$RESOURCE_GROUP" \ - --yes 2>/dev/null - echo " [OK] Arc cluster disconnected." - else - echo " [SKIP] Arc cluster '$ARC_CLUSTER_NAME' not found. Nothing to clean up." - fi -fi - -echo "" -echo " Pre-down cleanup complete." -echo "" diff --git a/hooks/preprovision.ps1 b/hooks/preprovision.ps1 index ec854b7..5c22a2e 100644 --- a/hooks/preprovision.ps1 +++ b/hooks/preprovision.ps1 @@ -41,7 +41,7 @@ if ($env:AZURE_SUBSCRIPTION_ID) { } $subName = (az account show --query "name" -o tsv 2>$null) Log-Success "Subscription: $subName" - Log-Success "ID" $env:AZURE_SUBSCRIPTION_ID + Log-Success "ID: $env:AZURE_SUBSCRIPTION_ID" } # ===================================================== @@ -75,14 +75,14 @@ if ($storageSku -eq 'Standard_ZRS') { Log-Info "Checking ZRS availability in $($env:AZURE_LOCATION)..." 
$zrsAvailable = $false try { - $skuInfo = (az provider show --namespace Microsoft.Storage ` - --query "resourceTypes[?resourceType=='storageAccounts'].zoneMappings[?contains(location, '$($env:AZURE_LOCATION)')].location | [0][0]" ` + $skuInfo = (az storage account list-skus --location $env:AZURE_LOCATION ` + --query "[?name=='Standard_ZRS'].name | [0]" ` -o tsv 2>$null) - if ($skuInfo) { $zrsAvailable = $true } + if (-not [string]::IsNullOrWhiteSpace($skuInfo)) { $zrsAvailable = $true } } - catch { } + catch { $zrsAvailable = $false } if (-not $zrsAvailable) { - Log-Warning "Standard_ZRS may not be available in '$($env:AZURE_LOCATION)'." + Log-Warning "Requested Standard_ZRS is not available in '$($env:AZURE_LOCATION)'." Log-Warning "Falling back to Standard_LRS to avoid deployment failure." azd env set STORAGE_SKU_NAME Standard_LRS 2>$null $storageSku = 'Standard_LRS' @@ -91,7 +91,52 @@ if ($storageSku -eq 'Standard_ZRS') { Log-Success "ZRS is available in $($env:AZURE_LOCATION)" } } -Write-KeyValue "Storage SKU" $storageSku +Write-KeyValue "STORAGE_SKU" $storageSku + +# ── Validate Kubernetes version against region capabilities ────────────── +# Pin minors drift from standard support to LTS-only (e.g. 1.32 → LTS). +# If the requested minor is not available on the standard "KubernetesOfficial" +# support plan in this region, auto-fall back to the region's default minor. 
+$requestedK8s = if ($env:KUBERNETES_VERSION) { $env:KUBERNETES_VERSION } else { '1.34' } +$versions = Invoke-AzJson { az aks get-versions --location $env:AZURE_LOCATION -o json } +if ($versions -and $versions.values) { + $standardVersions = @($versions.values | Where-Object { + $_.capabilities.supportPlan -contains 'KubernetesOfficial' + } | ForEach-Object { $_.version }) + $defaultVersion = ($versions.values | Where-Object { $_.isDefault } | Select-Object -First 1).version + if ($standardVersions -notcontains $requestedK8s) { + if ($defaultVersion) { + Log-Warning "Kubernetes '$requestedK8s' is not on standard support in '$($env:AZURE_LOCATION)'." + Log-Warning "Falling back to region default: '$defaultVersion'." + [void](Invoke-AzdEnvSet -Name 'KUBERNETES_VERSION' -Value $defaultVersion) + $env:KUBERNETES_VERSION = $defaultVersion + $requestedK8s = $defaultVersion + } + else { + Log-Warning "Kubernetes '$requestedK8s' is not on standard support and no default could be resolved." + } + } + Write-KeyValue "KUBERNETES_VERSION" $requestedK8s +} +else { + Log-Warning "Could not query AKS versions in '$($env:AZURE_LOCATION)'. Proceeding with '$requestedK8s'." +} + +# Persist the resolved CREATE_FOUNDRY_PROJECT value so Bicep + all downstream +# hooks agree on the same boolean. (Bicep's default is true; the hooks default +# to true to match. Without this step, a step that reads $env:... raw would +# see the empty string and take the 'false' branch.) +$resolvedFoundry = if ($CREATE_FOUNDRY_PROJECT) { 'true' } else { 'false' } +if ($env:CREATE_FOUNDRY_PROJECT -ne $resolvedFoundry) { + try { + azd env set CREATE_FOUNDRY_PROJECT $resolvedFoundry 2>$null + $env:CREATE_FOUNDRY_PROJECT = $resolvedFoundry + Write-KeyValue "CREATE_FOUNDRY_PROJECT" $resolvedFoundry + } + catch { + Log-Warning "Could not persist CREATE_FOUNDRY_PROJECT via 'azd env set'." 
+ } +} # ===================================================== # Step 4: Resolve AI Model Quota (if Foundry enabled) @@ -124,7 +169,34 @@ if ($allVmSizes.Count -eq 0) { exit 1 } -Log-Success "Found $($allVmSizes.Count) VM sizes in $($env:AZURE_LOCATION)" +$cpuCount = @($allVmSizes | Where-Object { $n = $_.Name; @($CPU_VM_PREFIXES | Where-Object { $n.StartsWith($_) }).Count -gt 0 }).Count +$gpuCount = @($allVmSizes | Where-Object { $n = $_.Name; @($GPU_VM_PREFIXES | Where-Object { $n.StartsWith($_) }).Count -gt 0 }).Count +Log-Success "Found VM SKUs in $($env:AZURE_LOCATION)" +Log-Info "$cpuCount CPU + $gpuCount GPU sizes available" + +if ($cpuCount -eq 0 -and $gpuCount -eq 0) { + Log-Error "No VM sizes are available in '$($env:AZURE_LOCATION)' for this subscription." + Log-Info "This usually means the region has restrictive SKU policies for your subscription." + Log-Info "Try a different region: run 'azd env set AZURE_LOCATION <region>' and re-run 'azd up'." + Log-Info "Recommended regions: eastus2, westus3, southcentralus, northeurope" + exit 1 +} + +if ($cpuCount -eq 0) { + Log-Error "No CPU VM sizes (D/E/F-series) are available in '$($env:AZURE_LOCATION)'." + Log-Info "AKS requires CPU nodes for system and workload pools." + Log-Info "Try a different region: run 'azd env set AZURE_LOCATION <region>' and re-run 'azd up'." + Log-Info "Recommended regions: eastus2, westus3, southcentralus, northeurope" + exit 1 +} + +if ($gpuCount -eq 0) { + Log-Error "No GPU VM sizes (NC/NV/ND-series) are available in '$($env:AZURE_LOCATION)'." + Log-Info "GPU pools are required for video processing. Consider a region with GPU support." + Log-Info "Try: run 'azd env set AZURE_LOCATION <region>' and re-run 'azd up'."
+ Log-Info "Recommended GPU regions: eastus2, westus3, southcentralus, northeurope" + exit 1 +} # Determine current/default values (env var overrides config default) $currentSystemVm = if ($env:SYSTEM_VM_SIZE) { $env:SYSTEM_VM_SIZE } else { $DEFAULT_SYSTEM_VM_SIZE } @@ -132,35 +204,44 @@ $currentWorkloadVm = if ($env:WORKLOAD_VM_SIZE) { $env:WORKLOAD_VM_SIZE $currentDeepstreamVm = if ($env:DEEPSTREAM_GPU_VM_SIZE) { $env:DEEPSTREAM_GPU_VM_SIZE } else { $DEFAULT_DEEPSTREAM_GPU_SIZE } $currentInferenceVm = if ($env:INFERENCE_GPU_VM_SIZE) { $env:INFERENCE_GPU_VM_SIZE } else { $DEFAULT_INFERENCE_GPU_SIZE } -# Pick recommended VM sizes per pool (4 families x 3 sizes = ~12 items each) -$systemVmSizes = Select-VmSizesForMenu -AllSizes $allVmSizes -Prefixes $CPU_VM_PREFIXES -RecommendedFamilies $SYSTEM_RECOMMENDED_FAMILIES -DefaultSku $currentSystemVm -SizesPerFamily $SIZES_PER_FAMILY -MaxCores $SYSTEM_MAX_CORES -$workloadVmSizes = Select-VmSizesForMenu -AllSizes $allVmSizes -Prefixes $CPU_VM_PREFIXES -RecommendedFamilies $WORKLOAD_RECOMMENDED_FAMILIES -DefaultSku $currentWorkloadVm -SizesPerFamily $SIZES_PER_FAMILY -MinCores $WORKLOAD_MIN_CORES -$gpuVmSizes = Select-VmSizesForMenu -AllSizes $allVmSizes -Prefixes $GPU_VM_PREFIXES -RecommendedFamilies $GPU_RECOMMENDED_FAMILIES -DefaultSku $currentDeepstreamVm -SizesPerFamily $SIZES_PER_FAMILY +Log-Info "Querying VM quota in region..." +$quotaData = Get-AzVmQuotaForRegion -Location $env:AZURE_LOCATION +if ($quotaData -and $quotaData.Keys.Count -gt 0) { + Log-Success "Quota data retrieved for $($quotaData.Keys.Count) VM families" +} +else { + Log-Warning "Could not retrieve VM quota data for '$($env:AZURE_LOCATION)'. Menus will show all SKUs without quota filtering." 
+ $quotaData = @{} +} -# Resolve defaults — if the configured default isn't available, pick the closest match -$currentSystemVm = Resolve-DefaultSku -DefaultSku $currentSystemVm -AvailableSizes $systemVmSizes -PreferCores 4 -$currentWorkloadVm = Resolve-DefaultSku -DefaultSku $currentWorkloadVm -AvailableSizes $workloadVmSizes -PreferCores 32 -$currentDeepstreamVm = Resolve-DefaultSku -DefaultSku $currentDeepstreamVm -AvailableSizes $gpuVmSizes -PreferCores 24 -$currentInferenceVm = Resolve-DefaultSku -DefaultSku $currentInferenceVm -AvailableSizes $gpuVmSizes -PreferCores 24 +# Pick recommended VM sizes per pool (4 families x 3 sizes = ~12 items each). +# QuotaData filters out SKUs whose family can't satisfy the pool's max node count. +$systemVmSizes = Select-VmSizesForMenu -AllSizes $allVmSizes -Prefixes $CPU_VM_PREFIXES -RecommendedFamilies $SYSTEM_RECOMMENDED_FAMILIES -DefaultSku $currentSystemVm -SizesPerFamily $SIZES_PER_FAMILY -MaxCores $SYSTEM_MAX_CORES -QuotaData $quotaData -MaxNodes $SYSTEM_MAX_NODE_COUNT +$workloadVmSizes = Select-VmSizesForMenu -AllSizes $allVmSizes -Prefixes $CPU_VM_PREFIXES -RecommendedFamilies $WORKLOAD_RECOMMENDED_FAMILIES -DefaultSku $currentWorkloadVm -SizesPerFamily $SIZES_PER_FAMILY -MinCores $WORKLOAD_MIN_CORES -QuotaData $quotaData -MaxNodes $WORKLOAD_MAX_NODE_COUNT +$deepstreamVmSizes = Select-VmSizesForMenu -AllSizes $allVmSizes -Prefixes $GPU_VM_PREFIXES -RecommendedFamilies $GPU_RECOMMENDED_FAMILIES -DefaultSku $currentDeepstreamVm -SizesPerFamily $SIZES_PER_FAMILY -QuotaData $quotaData -MaxNodes $DEEPSTREAM_GPU_MAX_NODE_COUNT +$inferenceVmSizes = Select-VmSizesForMenu -AllSizes $allVmSizes -Prefixes $GPU_VM_PREFIXES -RecommendedFamilies $GPU_RECOMMENDED_FAMILIES -DefaultSku $currentInferenceVm -SizesPerFamily $SIZES_PER_FAMILY -QuotaData $quotaData -MaxNodes $INFERENCE_GPU_MAX_NODE_COUNT -Write-KeyValue "System pool" "$($systemVmSizes.Count) sizes" -Write-KeyValue "Workload pool" "$($workloadVmSizes.Count) sizes" 
-Write-KeyValue "GPU pool" "$($gpuVmSizes.Count) sizes" +# Resolve defaults — if the configured default isn't available, pick the closest match +$currentSystemVm = Resolve-DefaultSku -DefaultSku $currentSystemVm -AvailableSizes $systemVmSizes -PreferCores 4 +$currentWorkloadVm = Resolve-DefaultSku -DefaultSku $currentWorkloadVm -AvailableSizes $workloadVmSizes -PreferCores 32 +$currentDeepstreamVm = Resolve-DefaultSku -DefaultSku $currentDeepstreamVm -AvailableSizes $deepstreamVmSizes -PreferCores 24 +$currentInferenceVm = Resolve-DefaultSku -DefaultSku $currentInferenceVm -AvailableSizes $inferenceVmSizes -PreferCores 24 -Log-Info "Querying GPU quota..." -$quotaData = Get-AzVmQuotaForRegion -Location $env:AZURE_LOCATION -Log-Success "GPU quota data retrieved" +Write-KeyValue "System pool" "$($systemVmSizes.Count) sizes (with quota)" +Write-KeyValue "Workload pool" "$($workloadVmSizes.Count) sizes (with quota)" +Write-KeyValue "Deepstream pool" "$($deepstreamVmSizes.Count) sizes (with quota)" +Write-KeyValue "Inference pool" "$($inferenceVmSizes.Count) sizes (with quota)" Write-Section "Choose a VM SKU for each AKS node pool" Log-Info "The default is highlighted. Press Enter to accept, C for custom." 
-# CPU pools (separate lists for system vs workload) -$selectedSystem = Show-VmSelectionMenu -PoolName "System (CPU)" -EnvVarName "SYSTEM_VM_SIZE" -VmSizes $systemVmSizes -DefaultSku $currentSystemVm -Location $env:AZURE_LOCATION -$selectedWorkload = Show-VmSelectionMenu -PoolName "Workload (CPU)" -EnvVarName "WORKLOAD_VM_SIZE" -VmSizes $workloadVmSizes -DefaultSku $currentWorkloadVm -Location $env:AZURE_LOCATION +# CPU pools (quota-filtered lists, node count determines total cores checked) +$selectedSystem = Show-VmSelectionMenu -PoolName "System (CPU)" -EnvVarName "SYSTEM_VM_SIZE" -VmSizes $systemVmSizes -DefaultSku $currentSystemVm -Location $env:AZURE_LOCATION -MaxNodes $SYSTEM_MAX_NODE_COUNT -QuotaData $quotaData +$selectedWorkload = Show-VmSelectionMenu -PoolName "Workload (CPU)" -EnvVarName "WORKLOAD_VM_SIZE" -VmSizes $workloadVmSizes -DefaultSku $currentWorkloadVm -Location $env:AZURE_LOCATION -MaxNodes $WORKLOAD_MAX_NODE_COUNT -QuotaData $quotaData # GPU pools (quota validated inline, node count determines total cores checked) -$selectedDeepstream = Show-VmSelectionMenu -PoolName "Deepstream (GPU)" -EnvVarName "DEEPSTREAM_GPU_VM_SIZE" -VmSizes $gpuVmSizes -DefaultSku $currentDeepstreamVm -Location $env:AZURE_LOCATION -IsGpu -MaxNodes $DEEPSTREAM_GPU_MAX_NODE_COUNT -QuotaData $quotaData -$selectedInference = Show-VmSelectionMenu -PoolName "Inference (GPU)" -EnvVarName "INFERENCE_GPU_VM_SIZE" -VmSizes $gpuVmSizes -DefaultSku $currentInferenceVm -Location $env:AZURE_LOCATION -IsGpu -MaxNodes $INFERENCE_GPU_MAX_NODE_COUNT -QuotaData $quotaData +$selectedDeepstream = Show-VmSelectionMenu -PoolName "Deepstream (GPU)" -EnvVarName "DEEPSTREAM_GPU_VM_SIZE" -VmSizes $deepstreamVmSizes -DefaultSku $currentDeepstreamVm -Location $env:AZURE_LOCATION -IsGpu -MaxNodes $DEEPSTREAM_GPU_MAX_NODE_COUNT -QuotaData $quotaData +$selectedInference = Show-VmSelectionMenu -PoolName "Inference (GPU)" -EnvVarName "INFERENCE_GPU_VM_SIZE" -VmSizes $inferenceVmSizes -DefaultSku 
$currentInferenceVm -Location $env:AZURE_LOCATION -IsGpu -MaxNodes $INFERENCE_GPU_MAX_NODE_COUNT -QuotaData $quotaData # Update script-scope variables for downstream consumers $DEEPSTREAM_GPU_VM_SIZE = $selectedDeepstream.Sku diff --git a/hooks/preprovision.sh b/hooks/preprovision.sh index fe4a79d..7571524 100755 --- a/hooks/preprovision.sh +++ b/hooks/preprovision.sh @@ -4,8 +4,7 @@ # Pre-Provision Script: Validate prerequisites before provisioning # ============================================================================= -set -e - +set -eo pipefail source "$(dirname "$0")/common.sh" TOTAL_STEPS=6 @@ -17,14 +16,14 @@ write_foundry_banner "Pre-Provision Validation" # ===================================================== log_step 1 $TOTAL_STEPS "Checking Azure CLI Authentication" -ACCOUNT_INFO=$(az account show --query "{name:name, id:id}" -o tsv 2>/dev/null || true) +ACCOUNT_INFO=$(az account show --query "{name:name, id:id}" -o tsv 2>/dev/null | tr -d '\r' || true) if [ -z "$ACCOUNT_INFO" ]; then log_error "Not logged in to Azure CLI. Run 'az login' before provisioning." exit 1 fi -ACCOUNT_NAME=$(az account show --query "name" -o tsv 2>/dev/null) +ACCOUNT_NAME=$(az account show --query "name" -o tsv 2>/dev/null | tr -d '\r') log_success "Signed in to account: ${ACCOUNT_NAME}" if [ -n "${AZURE_SUBSCRIPTION_ID:-}" ]; then @@ -33,9 +32,9 @@ if [ -n "${AZURE_SUBSCRIPTION_ID:-}" ]; then log_info "Verify the subscription ID and your access permissions." 
exit 1 } - SUB_NAME=$(az account show --query "name" -o tsv 2>/dev/null) + SUB_NAME=$(az account show --query "name" -o tsv 2>/dev/null | tr -d '\r') log_success "Subscription: ${SUB_NAME}" - write_key_value "ID" "$AZURE_SUBSCRIPTION_ID" + log_success "ID ${AZURE_SUBSCRIPTION_ID}" fi # ===================================================== @@ -51,7 +50,7 @@ assert_cli_tools az helm kubectl -- kubelogin jq log_step 3 $TOTAL_STEPS "Validating Environment Variables" for var in AZURE_SUBSCRIPTION_ID AZURE_LOCATION AZURE_ENV_NAME; do - eval val=\$$var 2>/dev/null || val="" + val="${!var:-}" if [ -z "$val" ]; then write_health_row "$var" "Fail" "not set" else @@ -65,11 +64,11 @@ assert_env_vars AZURE_SUBSCRIPTION_ID AZURE_LOCATION AZURE_ENV_NAME STORAGE_SKU="${STORAGE_SKU_NAME:-Standard_LRS}" if [ "$STORAGE_SKU" = "Standard_ZRS" ]; then log_info "Checking ZRS availability in ${AZURE_LOCATION}..." - ZRS_AVAILABLE=$(az provider show --namespace Microsoft.Storage \ - --query "resourceTypes[?resourceType=='storageAccounts'].zoneMappings[?contains(location, '${AZURE_LOCATION}')].location | [0][0]" \ - -o tsv 2>/dev/null || true) + ZRS_AVAILABLE=$(az storage account list-skus --location "$AZURE_LOCATION" \ + --query "[?name=='Standard_ZRS'].name | [0]" \ + -o tsv 2>/dev/null | tr -d '\r' || true) if [ -z "$ZRS_AVAILABLE" ]; then - log_warning "Standard_ZRS may not be available in '${AZURE_LOCATION}'." + log_warning "Requested Standard_ZRS is not available in '${AZURE_LOCATION}'." log_warning "Falling back to Standard_LRS to avoid deployment failure." 
azd env set STORAGE_SKU_NAME Standard_LRS 2>/dev/null || true STORAGE_SKU="Standard_LRS" @@ -77,7 +76,42 @@ if [ "$STORAGE_SKU" = "Standard_ZRS" ]; then log_success "ZRS is available in ${AZURE_LOCATION}" fi fi -write_key_value "Storage SKU" "$STORAGE_SKU" +write_key_value "STORAGE_SKU" "$STORAGE_SKU" + +# ── Validate Kubernetes version against region capabilities ────────────── +# Pinned minor versions drift from standard support to LTS-only (e.g. 1.32 → LTS). +# If the requested minor is not available on the standard "KubernetesOfficial" +# support plan in this region, auto-fall back to the region's default minor. +REQUESTED_K8S="${KUBERNETES_VERSION:-1.34}" +K8S_VERSIONS_JSON=$(az aks get-versions --location "$AZURE_LOCATION" -o json 2>/dev/null || true) +if [ -n "$K8S_VERSIONS_JSON" ] && command -v jq >/dev/null 2>&1; then + K8S_STANDARD=$(echo "$K8S_VERSIONS_JSON" | jq -r '.values[] | select(.capabilities.supportPlan | contains(["KubernetesOfficial"])) | .version' 2>/dev/null || true) + K8S_DEFAULT=$(echo "$K8S_VERSIONS_JSON" | jq -r '.values[] | select(.isDefault==true) | .version' 2>/dev/null | head -n1 || true) + if ! grep -qxF -- "$REQUESTED_K8S" <<<"$K8S_STANDARD"; then + if [ -n "$K8S_DEFAULT" ]; then + log_warning "Kubernetes '${REQUESTED_K8S}' is not on standard support in '${AZURE_LOCATION}'." + log_warning "Falling back to region default: '${K8S_DEFAULT}'." + azd_env_set KUBERNETES_VERSION "$K8S_DEFAULT" || true + export KUBERNETES_VERSION="$K8S_DEFAULT" + REQUESTED_K8S="$K8S_DEFAULT" + else + log_warning "Kubernetes '${REQUESTED_K8S}' is not on standard support and no default could be resolved." + fi + fi + write_key_value "KUBERNETES_VERSION" "$REQUESTED_K8S" +else + log_warning "Could not query AKS versions in '${AZURE_LOCATION}'. Proceeding with '${REQUESTED_K8S}'." +fi + +# Persist the resolved CREATE_FOUNDRY_PROJECT value so Bicep + all downstream +# hooks agree on the same boolean. (Bicep's default is true; the hooks default +# to true to match.)
+if azd env set CREATE_FOUNDRY_PROJECT "$CREATE_FOUNDRY_PROJECT" 2>/dev/null; then + export CREATE_FOUNDRY_PROJECT + write_key_value "CREATE_FOUNDRY_PROJECT" "$CREATE_FOUNDRY_PROJECT" +else + log_warning "Could not persist CREATE_FOUNDRY_PROJECT via 'azd env set'." +fi # ===================================================== # Step 4: Resolve AI Model Quota (if Foundry enabled) @@ -113,35 +147,74 @@ log_success "Found VM SKUs in ${AZURE_LOCATION}" log_info "${#ALL_CPU_VMS[@]} CPU + ${#ALL_GPU_VMS[@]} GPU sizes available" +if [ "${#ALL_CPU_VMS[@]}" -eq 0 ] && [ "${#ALL_GPU_VMS[@]}" -eq 0 ]; then + log_error "No VM sizes are available in '${AZURE_LOCATION}' for this subscription." + log_info "This usually means the region has restrictive SKU policies for your subscription." + log_info "Try a different region: run 'azd env set AZURE_LOCATION <region>' and re-run 'azd up'." + log_info "Recommended regions: eastus2, westus3, southcentralus, northeurope" + exit 1 +fi + +if [ "${#ALL_CPU_VMS[@]}" -eq 0 ]; then + log_error "No CPU VM sizes (D/E/F-series) are available in '${AZURE_LOCATION}'." + log_info "AKS requires CPU nodes for system and workload pools." + log_info "Try a different region: run 'azd env set AZURE_LOCATION <region>' and re-run 'azd up'." + log_info "Recommended regions: eastus2, westus3, southcentralus, northeurope" + exit 1 +fi + +if [ "${#ALL_GPU_VMS[@]}" -eq 0 ]; then + log_error "No GPU VM sizes (NC/NV/ND-series) are available in '${AZURE_LOCATION}'." + log_info "GPU pools are required for video processing. Consider a region with GPU support." + log_info "Try: run 'azd env set AZURE_LOCATION <region>' and re-run 'azd up'." + log_info "Recommended GPU regions: eastus2, westus3, southcentralus, northeurope" + exit 1 +fi + +log_info "Querying VM quota in region..." +if fetch_vm_quota_map "$AZURE_LOCATION"; then + log_success "Quota data retrieved for ${VM_QUOTA_MAP_COUNT} VM families" +else + log_warning "Could not retrieve VM quota data; menus will show all SKUs."
+fi + select_vm_sizes_for_menu ALL_CPU_VMS SYSTEM_VMS SYSTEM_RECOMMENDED_FAMILIES "$CURRENT_SYSTEM_VM" "$SIZES_PER_FAMILY" 0 "$SYSTEM_MAX_CORES" select_vm_sizes_for_menu ALL_CPU_VMS WORKLOAD_VMS WORKLOAD_RECOMMENDED_FAMILIES "$CURRENT_WORKLOAD_VM" "$SIZES_PER_FAMILY" "$WORKLOAD_MIN_CORES" -select_vm_sizes_for_menu ALL_GPU_VMS GPU_VMS GPU_RECOMMENDED_FAMILIES "$CURRENT_DEEPSTREAM_VM" "$SIZES_PER_FAMILY" +select_vm_sizes_for_menu ALL_GPU_VMS DEEPSTREAM_VMS GPU_RECOMMENDED_FAMILIES "$CURRENT_DEEPSTREAM_VM" "$SIZES_PER_FAMILY" +select_vm_sizes_for_menu ALL_GPU_VMS INFERENCE_VMS GPU_RECOMMENDED_FAMILIES "$CURRENT_INFERENCE_VM" "$SIZES_PER_FAMILY" + +# Annotate + filter by quota using each pool's max node count. +annotate_vm_sizes_with_quota SYSTEM_VMS "$CURRENT_SYSTEM_VM" "$SYSTEM_MAX_NODE_COUNT" +annotate_vm_sizes_with_quota WORKLOAD_VMS "$CURRENT_WORKLOAD_VM" "$WORKLOAD_MAX_NODE_COUNT" +annotate_vm_sizes_with_quota DEEPSTREAM_VMS "$CURRENT_DEEPSTREAM_VM" "$DEEPSTREAM_GPU_MAX_NODE_COUNT" +annotate_vm_sizes_with_quota INFERENCE_VMS "$CURRENT_INFERENCE_VM" "$INFERENCE_GPU_MAX_NODE_COUNT" # Resolve defaults — if the configured default isn't available, pick the closest match CURRENT_SYSTEM_VM=$(resolve_default_sku "$CURRENT_SYSTEM_VM" SYSTEM_VMS 4) CURRENT_WORKLOAD_VM=$(resolve_default_sku "$CURRENT_WORKLOAD_VM" WORKLOAD_VMS 32) -CURRENT_DEEPSTREAM_VM=$(resolve_default_sku "$CURRENT_DEEPSTREAM_VM" GPU_VMS 24) -CURRENT_INFERENCE_VM=$(resolve_default_sku "$CURRENT_INFERENCE_VM" GPU_VMS 24) +CURRENT_DEEPSTREAM_VM=$(resolve_default_sku "$CURRENT_DEEPSTREAM_VM" DEEPSTREAM_VMS 24) +CURRENT_INFERENCE_VM=$(resolve_default_sku "$CURRENT_INFERENCE_VM" INFERENCE_VMS 24) -write_key_value "System pool" "${#SYSTEM_VMS[@]} sizes" -write_key_value "Workload pool" "${#WORKLOAD_VMS[@]} sizes" -write_key_value "GPU pool" "${#GPU_VMS[@]} sizes" +write_key_value "System pool" "${#SYSTEM_VMS[@]} sizes (with quota)" +write_key_value "Workload pool" "${#WORKLOAD_VMS[@]} sizes (with quota)" 
+write_key_value "Deepstream pool" "${#DEEPSTREAM_VMS[@]} sizes (with quota)" +write_key_value "Inference pool" "${#INFERENCE_VMS[@]} sizes (with quota)" write_section "Choose a VM SKU for each AKS node pool" log_info "The default is highlighted. Press Enter to accept, C for custom." -# CPU pools (separate lists for system vs workload) -show_vm_selection_menu "System (CPU)" "SYSTEM_VM_SIZE" SYSTEM_VMS "$CURRENT_SYSTEM_VM" "$AZURE_LOCATION" +# CPU pools (quota-filtered lists, node count determines total cores checked) +show_vm_selection_menu "System (CPU)" "SYSTEM_VM_SIZE" SYSTEM_VMS "$CURRENT_SYSTEM_VM" "$AZURE_LOCATION" "$SYSTEM_MAX_NODE_COUNT" SYSTEM_SKU="$SELECTED_VM_SKU" -show_vm_selection_menu "Workload (CPU)" "WORKLOAD_VM_SIZE" WORKLOAD_VMS "$CURRENT_WORKLOAD_VM" "$AZURE_LOCATION" +show_vm_selection_menu "Workload (CPU)" "WORKLOAD_VM_SIZE" WORKLOAD_VMS "$CURRENT_WORKLOAD_VM" "$AZURE_LOCATION" "$WORKLOAD_MAX_NODE_COUNT" WORKLOAD_SKU="$SELECTED_VM_SKU" # GPU pools (quota validated inline, node count determines total cores checked) -show_vm_selection_menu "Deepstream (GPU)" "DEEPSTREAM_GPU_VM_SIZE" GPU_VMS "$CURRENT_DEEPSTREAM_VM" "$AZURE_LOCATION" "$DEEPSTREAM_GPU_MAX_NODE_COUNT" "gpu" +show_vm_selection_menu "Deepstream (GPU)" "DEEPSTREAM_GPU_VM_SIZE" DEEPSTREAM_VMS "$CURRENT_DEEPSTREAM_VM" "$AZURE_LOCATION" "$DEEPSTREAM_GPU_MAX_NODE_COUNT" "gpu" DEEPSTREAM_GPU_VM_SIZE="$SELECTED_VM_SKU" -show_vm_selection_menu "Inference (GPU)" "INFERENCE_GPU_VM_SIZE" GPU_VMS "$CURRENT_INFERENCE_VM" "$AZURE_LOCATION" "$INFERENCE_GPU_MAX_NODE_COUNT" "gpu" +show_vm_selection_menu "Inference (GPU)" "INFERENCE_GPU_VM_SIZE" INFERENCE_VMS "$CURRENT_INFERENCE_VM" "$AZURE_LOCATION" "$INFERENCE_GPU_MAX_NODE_COUNT" "gpu" INFERENCE_GPU_VM_SIZE="$SELECTED_VM_SKU" # ===================================================== diff --git a/hooks/ui.sh b/hooks/ui.sh index 250bfcb..456377d 100755 --- a/hooks/ui.sh +++ b/hooks/ui.sh @@ -109,6 +109,12 @@ log_info() { _write_log_message "$1" 
"$SYM_INFO" "$C_ACCENT" } +# Character count (not byte count) — safe for UTF-8 glyphs like symbols. +# Use this instead of ${#var} when computing display widths. +_str_len() { + printf '%s' "$1" | wc -m | tr -d ' ' +} + log_success() { _write_log_message "$1" "$SYM_SUCCESS" "$C_SUCCESS" "$C_SUCCESS" } @@ -128,7 +134,7 @@ log_step() { # Divider length matches the title line above (min 40) local header_text="${SYM_STEP} [${number}/${total}] ${title}" - local divider_len=${#header_text} + local divider_len; divider_len=$(_str_len "$header_text") [ "$divider_len" -lt 40 ] && divider_len=40 local rule="" @@ -169,7 +175,7 @@ write_box_banner() { local spaces; spaces=$(_repeat_char " " "$inner_width") # Centered title - local text_len=${#text} + local text_len; text_len=$(_str_len "$text") local left_pad=$(( (inner_width - text_len) / 2 )) local right_pad=$(( inner_width - text_len - left_pad )) local left_spaces; left_spaces=$(_repeat_char " " "$left_pad") @@ -182,7 +188,7 @@ write_box_banner() { "$color" "$right_spaces" "$v" "$C_RESET" if [ -n "$subtitle" ]; then - local sub_len=${#subtitle} + local sub_len; sub_len=$(_str_len "$subtitle") local sub_left=$(( (inner_width - sub_len) / 2 )) local sub_right=$(( inner_width - sub_len - sub_left )) local sub_left_sp; sub_left_sp=$(_repeat_char " " "$sub_left") @@ -231,7 +237,7 @@ write_foundry_banner() { write_title() { local text="$1" - local text_len=${#text} + local text_len; text_len=$(_str_len "$text") local rule; rule=$(_repeat_char "$SYM_HLINE" "$((text_len + 4))") printf "\n %b%s%b\n" "$C_BOLD_WHITE" "$text" "$C_RESET" printf " %b%s%b\n" "$C_ACCENT_DIM" "$rule" "$C_RESET" diff --git a/hooks/validate-env.ps1 b/hooks/validate-env.ps1 index 31ed873..6ce57b5 100644 --- a/hooks/validate-env.ps1 +++ b/hooks/validate-env.ps1 @@ -36,7 +36,7 @@ Test-EnvVarFormat -VarName "AZURE_SUBSCRIPTION_ID" -Pattern $uuidPattern -Exampl Test-EnvVarFormat -VarName "AZURE_PRINCIPAL_ID" -Pattern $uuidPattern -Example 
"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" # ── VM sizes must start with Standard_ ──────────────────────────────────── -$vmPattern = '^Standard_\w+' +$vmPattern = '^Standard_[A-Za-z0-9_-]+$' Test-EnvVarFormat -VarName "SYSTEM_VM_SIZE" -Pattern $vmPattern -Example "Standard_D4a_v4" Test-EnvVarFormat -VarName "WORKLOAD_VM_SIZE" -Pattern $vmPattern -Example "Standard_D32a_v4" Test-EnvVarFormat -VarName "DEEPSTREAM_GPU_VM_SIZE" -Pattern $vmPattern -Example "Standard_NC24ads_A100_v4" @@ -53,7 +53,7 @@ $skuPattern = '^Standard_(LRS|ZRS|GRS)$' Test-EnvVarFormat -VarName "STORAGE_SKU_NAME" -Pattern $skuPattern -Example "Standard_LRS, Standard_ZRS, or Standard_GRS" # ── Location must be non-empty if set ────────────────────────────────────── -$locationPattern = '^[a-z0-9]+$' +$locationPattern = '^[a-z0-9]+(-[a-z0-9]+)*$' Test-EnvVarFormat -VarName "AZURE_LOCATION" -Pattern $locationPattern -Example "eastus2" if ($hasError) { diff --git a/hooks/validate-env.sh b/hooks/validate-env.sh index cefe800..d86680a 100755 --- a/hooks/validate-env.sh +++ b/hooks/validate-env.sh @@ -8,8 +8,7 @@ # Pattern adapted from get-started-with-ai-agents/scripts/validate_env_vars.sh # ============================================================================= -set -e - +set -eo pipefail has_error=false validate_env_var() { @@ -38,7 +37,7 @@ validate_env_var "AZURE_SUBSCRIPTION_ID" "$uuid_pattern" "xxxxxxxx-xxxx-xxxx-xxx validate_env_var "AZURE_PRINCIPAL_ID" "$uuid_pattern" "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" # ── VM sizes must start with Standard_ ──────────────────────────────────── -vm_pattern='^Standard_' +vm_pattern='^Standard_[A-Za-z0-9_-]+$' validate_env_var "SYSTEM_VM_SIZE" "$vm_pattern" "Standard_D4a_v4" validate_env_var "WORKLOAD_VM_SIZE" "$vm_pattern" "Standard_D32a_v4" validate_env_var "DEEPSTREAM_GPU_VM_SIZE" "$vm_pattern" "Standard_NC24ads_A100_v4" @@ -55,7 +54,7 @@ sku_pattern='^Standard_(LRS|ZRS|GRS)$' validate_env_var "STORAGE_SKU_NAME" "$sku_pattern" "Standard_LRS, Standard_ZRS,
or Standard_GRS" # ── Location must be non-empty lowercase alphanumeric if set ─────────────── -location_pattern='^[a-z0-9]+$' +location_pattern='^[a-z0-9]+(-[a-z0-9]+)*$' validate_env_var "AZURE_LOCATION" "$location_pattern" "eastus2" if [ "$has_error" = true ]; then diff --git a/infra/main.bicep b/infra/main.bicep index f5e10d0..0018425 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -23,7 +23,7 @@ param principalId string = '' param createRoleForUser bool @description('Whether to create a Foundry project and link it to the VI extension') -param createFoundryProject bool +param createFoundryProject bool = true @description('Model name to deploy in AI Foundry') param aiModelName string diff --git a/infra/main.parameters.json b/infra/main.parameters.json index 3372c50..073a886 100644 --- a/infra/main.parameters.json +++ b/infra/main.parameters.json @@ -18,7 +18,7 @@ "value": "${CREATE_ROLE_FOR_USER=true}" }, "kubernetesVersion": { - "value": "${KUBERNETES_VERSION=1.32}" + "value": "${KUBERNETES_VERSION=1.34}" }, "systemVmSize": { "value": "${SYSTEM_VM_SIZE=Standard_D4a_v4}" diff --git a/infra/modules/aks.bicep b/infra/modules/aks.bicep index cff9534..f47e71b 100644 --- a/infra/modules/aks.bicep +++ b/infra/modules/aks.bicep @@ -22,10 +22,10 @@ param deepstreamGpuVmSize string @description('VM size for the GPU inference workload node pool') param inferenceGpuVmSize string -@description('Maximum number of system nodes') +@description('Number of nodes in the system node pool (fixed count — not autoscaled)') @minValue(1) @maxValue(10) -param systemMaxNodeCount int = 2 +param systemNodeCount int = 2 @description('Maximum number of workload nodes') @minValue(1) @@ -48,10 +48,10 @@ param dnsPrefix string = name @description('Node resource group name') param nodeResourceGroup string -@description('Node label value used to target deepstream workloads') +// Node label value used to target deepstream workloads var deepstreamWorkloadLabel = 'deepstream' 
-@description('Node label value used to target inference workloads') +// Node label value used to target inference workloads var inferenceWorkloadLabel = 'inference' resource aksCluster 'Microsoft.ContainerService/managedClusters@2024-09-01' = { @@ -111,7 +111,7 @@ resource aksCluster 'Microsoft.ContainerService/managedClusters@2024-09-01' = { agentPoolProfiles: [ { name: 'system' - count: systemMaxNodeCount + count: systemNodeCount vmSize: systemVmSize osType: 'Linux' osSKU: 'AzureLinux'