From e7b4f2a58bc31c863e6d856d448c3ce946757df6 Mon Sep 17 00:00:00 2001 From: cameronmeissner Date: Tue, 30 Jun 2026 15:17:30 -0700 Subject: [PATCH 1/7] feat(linux): reduce prefetch optimization runtime within VHD builds --- vhdbuilder/prefetch/scripts/optimize.sh | 144 +++++++++++++++----- vhdbuilder/prefetch/templates/optimize.json | 7 +- 2 files changed, 111 insertions(+), 40 deletions(-) diff --git a/vhdbuilder/prefetch/scripts/optimize.sh b/vhdbuilder/prefetch/scripts/optimize.sh index c6c1433a050..25706588f7c 100755 --- a/vhdbuilder/prefetch/scripts/optimize.sh +++ b/vhdbuilder/prefetch/scripts/optimize.sh @@ -30,6 +30,15 @@ IMAGE_BUILDER_TEMPLATE_NAME="template-${CAPTURED_SIG_VERSION}-${BUILD_RUN_NUMBER IMAGE_BUILDER_TEMPLATE_NAME="${IMAGE_BUILDER_TEMPLATE_NAME:0:64}" VHD_URI="${STORAGE_ACCOUNT_BLOB_URL}/${VHD_NAME}" +# the image builder template distributes the prefetch-optimized image as a managed image into the +# temporary image builder resource group. we then convert that managed image into a VHD blob within +# the target storage account ourselves, which avoids the slow multi-copy VHD distribution performed +# by image builder when distributing directly to a VHD blob. +DISTRIBUTE_MANAGED_IMAGE_NAME="${CAPTURED_SIG_VERSION}-optimized" +DISTRIBUTE_MANAGED_IMAGE_ID="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${IMAGE_BUILDER_RG_NAME}/providers/Microsoft.Compute/images/${DISTRIBUTE_MANAGED_IMAGE_NAME}" +OPTIMIZED_DISK_NAME="${CAPTURED_SIG_VERSION}-optimized" +OPTIMIZED_DISK_ID="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${IMAGE_BUILDER_RG_NAME}/providers/Microsoft.Compute/disks/${OPTIMIZED_DISK_NAME}" + main() { # for idempotency, check to see if the VHD we're trying to create already exists. # if it's already in the expected state, this will cause the script to exit early. 
@@ -37,41 +46,31 @@ main() { # optimization + conversion flow check_for_existing_vhd - # attempt to perform prefetch optimization and VHD conversion + # attempt to perform prefetch optimization, distributing the optimized image as a managed image, + # then convert that managed image into a VHD blob within the target storage account ensure_image_builder_rg || exit $? run_image_builder_template || exit $? + convert_managed_image_to_vhd || exit $? } check_for_existing_vhd() { vhd_info="$(az storage blob show --blob-url "${VHD_URI}" --auth-mode login)" - if [ -n "${vhd_info}" ]; then - echo "VHD already exists at: ${VHD_URI}" - image_builder_source="$(jq -r '.metadata.VMImageBuilderSource' <<< "${vhd_info}")" - if [ -n "${image_builder_source}" ] && [ "${image_builder_source}" != "null" ]; then - echo "VHD ${VHD_URI} has already been produced by a previous image builder template run" - copy_status="$(jq -r '.properties.copy.status' <<< "${vhd_info}")" - if [ "${copy_status,,}" = "success" ]; then - echo "VHD ${VHD_URI} has been successfully copied from image builder storage, nothing to do" - exit 0 - fi - if [ "${copy_status,,}" = "pending" ]; then - echo "echo VHD ${VHD_URI} is currently being copied from image builder storage, will wait for copy completion" - if wait_for_vhd_copy; then - exit 0 - fi - echo "pending copy operation was not successful, will delete existing blob and attempt to retry optimization and VHD creation" - delete_vhd || exit $? - else - echo "VHD ${VHD_URI} has a bad copy state: ${copy_status}, will delete it and recreate" - delete_vhd || exit $? - fi - else - echo "VHD ${VHD_URI} exists but was not produced by an image builder template run, will delete before proceeding" - delete_vhd || exit $? 
- fi - else + if [ -z "${vhd_info}" ]; then echo "no existing VHD was found at: ${VHD_URI}, will proceed with optimization and VHD creation" + return 0 fi + echo "VHD already exists at: ${VHD_URI}" + + # we mark the VHD with the prefetchOptimized metadata flag only after the optimized managed image + # has been fully copied into the target storage account. if the flag is present, the VHD is complete + # and there is nothing left to do. otherwise, the blob is left over from an incomplete run and must + # be deleted before retrying the optimization + conversion flow. + if [ "$(jq -r '.metadata.prefetchOptimized' <<< "${vhd_info}")" = "true" ]; then + echo "VHD ${VHD_URI} was fully produced by a previous prefetch optimization run, nothing to do" + exit 0 + fi + echo "VHD ${VHD_URI} exists but was not produced by a prefetch optimization run, will delete before proceeding" + delete_vhd || exit $? } ensure_image_builder_rg() { @@ -92,7 +91,7 @@ run_image_builder_template() { -e "s##${SOURCE_TYPE}#g" \ -e "s##${SOURCE_ID_KEY}#g" \ -e "s##${SOURCE_ID}#g" \ - -e "s##${VHD_URI}#g" \ + -e "s##${DISTRIBUTE_MANAGED_IMAGE_ID}#g" \ "${IMAGE_BUILDER_TEMPLATE_PATH}" > input.json || return $? if [ ! 
-f "input.json" ]; then @@ -126,7 +125,7 @@ run_image_builder_template() { return 1 fi - echo "template ${IMAGE_BUILDER_TEMPLATE_NAME} has ran to completion, VHD has been published to: ${VHD_URI}" + echo "template ${IMAGE_BUILDER_TEMPLATE_NAME} has ran to completion, optimized managed image has been published to: ${DISTRIBUTE_MANAGED_IMAGE_ID}" } need_new_template() { @@ -275,18 +274,89 @@ create_temp_storage() { TEMP_VHD_URI="${temp_vhd_uri}" } -wait_for_vhd_copy() { - copy_status="$(az storage blob show --blob-url "${VHD_URI}" --auth-mode login | jq -r '.properties.copy.status')" - while [ "${copy_status,,}" = "pending" ]; do - echo "VHD ${VHD_URI} is still undergoing a pending copy operation" +wait_for_managed_image() { + # the managed image distributed by the image builder template should exist once the run succeeds, + # but the underlying ARM resource can take a brief moment to become visible after the run completes. + managed_image_id="" + attempts=0 + while [ "${attempts}" -lt 10 ]; do + managed_image_id="$(az image show -g "${IMAGE_BUILDER_RG_NAME}" -n "${DISTRIBUTE_MANAGED_IMAGE_NAME}" 2>/dev/null | jq -r '.id')" + if [ -n "${managed_image_id}" ] && [ "${managed_image_id,,}" != "null" ]; then + echo "found optimized managed image: ${managed_image_id}" + return 0 + fi + echo "optimized managed image ${DISTRIBUTE_MANAGED_IMAGE_NAME} not yet visible in ${IMAGE_BUILDER_RG_NAME}, will wait 30s before checking again" sleep 30s - copy_status="$(az storage blob show --blob-url "${VHD_URI}" --auth-mode login | jq -r '.properties.copy.status')" + attempts=$((attempts + 1)) done - if [ "${copy_status,,}" != "success" ]; then - echo "VHD copy over ${VHD_URI} finished with unexpected status: ${copy_status}" + echo "expected optimized managed image ${DISTRIBUTE_MANAGED_IMAGE_NAME} does not exist in ${IMAGE_BUILDER_RG_NAME}" + return 1 +} + +# convert_managed_image_to_vhd converts the prefetch-optimized managed image produced by the image +# builder template into a VHD 
blob within the target storage account. This replaces distributing a +# VHD directly from the image builder template, which performed multiple slow blob copies. The steps: +# 1. Create a managed disk in the image builder resource group from the optimized managed image +# 2. Grant temporary read (SAS) access to the disk +# 3. azcopy the disk directly into the target VHD blob in a single copy operation +# 4. Mark the VHD blob complete so retries of this step are idempotent +convert_managed_image_to_vhd() { + wait_for_managed_image || return $? + + if [ -z "$(az disk show --ids "${OPTIMIZED_DISK_ID}" | jq -r '.id')" ]; then + # CVM and TrustedLaunch images must carry their security type onto the managed disk, otherwise + # the disk cannot be created from the optimized managed image. Standard images get no security profile. + security_profile="" + if [ "${ENABLE_TRUSTED_LAUNCH,,}" = "true" ]; then + echo "optimized image is a TrustedLaunch flavor, will create managed disk with TrustedLaunch security type" + security_profile="\"securityProfile\": { \"securityType\": \"TrustedLaunch\" }, " + elif grep -q "cvm" <<< "$FEATURE_FLAGS"; then + echo "optimized image is a CVM flavor, will create managed disk with ConfidentialVM security type" + security_profile="\"securityProfile\": { \"securityType\": \"ConfidentialVM_VMGuestStateOnlyEncryptedWithPlatformKey\" }, " + fi + echo "creating managed disk ${OPTIMIZED_DISK_NAME} from optimized managed image ${DISTRIBUTE_MANAGED_IMAGE_ID}" + az resource create --id "${OPTIMIZED_DISK_ID}" --api-version "${MANAGED_DISK_API_VERSION}" --is-full-object --location "$LOCATION" --properties "{\"location\": \"$LOCATION\", \ + \"properties\": { \ + \"osType\": \"Linux\", \ + ${security_profile}\ + \"creationData\": { \ + \"createOption\": \"FromImage\", \ + \"imageReference\": { \ + \"id\": \"${DISTRIBUTE_MANAGED_IMAGE_ID}\" \ + } \ + } \ + } \ + }" || return $? 
+ echo "created managed disk ${OPTIMIZED_DISK_ID} from optimized managed image" + fi + + set +x + # revoke any lingering access from a prior interrupted run so grant-access does not fail + az disk revoke-access --ids "${OPTIMIZED_DISK_ID}" >/dev/null 2>&1 || true + disk_sas_url=$(az disk grant-access --ids "${OPTIMIZED_DISK_ID}" --duration-in-seconds 1800 | jq -r '.accessSAS') + if [ -z "${disk_sas_url}" ] || [ "${disk_sas_url,,}" = "null" ] || [ "${disk_sas_url,,}" = "none" ]; then + echo "generated SAS URL for optimized managed disk is empty, cannot continue" return 1 fi - echo "pending copy operation over ${VHD_URI} has completed successfully" + echo "setting azcopy environment variables with pool identity: ${IMAGE_BUILDER_IDENTITY_ID}" + export AZCOPY_AUTO_LOGIN_TYPE="AZCLI" + export AZCOPY_CONCURRENCY_VALUE="AUTO" + echo "copying optimized disk ${OPTIMIZED_DISK_ID} to ${VHD_URI}" + azcopy copy "${disk_sas_url}" "${VHD_URI}" --recursive=true + azcopy_exit_code=$? + set -x + + az disk revoke-access --ids "${OPTIMIZED_DISK_ID}" || echo "unable to revoke access to ${OPTIMIZED_DISK_ID}, will proceed" + + if [ "${azcopy_exit_code}" -ne 0 ]; then + echo "failed to copy optimized disk ${OPTIMIZED_DISK_ID} to ${VHD_URI}" + return "${azcopy_exit_code}" + fi + + # mark the VHD as fully produced by prefetch optimization so retries of this step exit early + az storage blob metadata update --blob-url "${VHD_URI}" --auth-mode login --metadata prefetchOptimized=true || return $? 
+ + echo "optimized VHD has been published to: ${VHD_URI}" } delete_vhd() { diff --git a/vhdbuilder/prefetch/templates/optimize.json b/vhdbuilder/prefetch/templates/optimize.json index 8bb0448d895..3d5c453cab4 100644 --- a/vhdbuilder/prefetch/templates/optimize.json +++ b/vhdbuilder/prefetch/templates/optimize.json @@ -33,9 +33,10 @@ }, "distribute": [ { - "type": "VHD", - "uri": "", - "runOutputName": "VHD" + "type": "ManagedImage", + "imageId": "", + "location": "", + "runOutputName": "ManagedImage" } ] } From deccd4731932f62d1fdbbade0073943a63930292 Mon Sep 17 00:00:00 2001 From: cameronmeissner Date: Tue, 30 Jun 2026 17:02:01 -0700 Subject: [PATCH 2/7] chore: try alternative approach --- vhdbuilder/prefetch/scripts/optimize.sh | 107 +++++--------------- vhdbuilder/prefetch/templates/optimize.json | 6 +- 2 files changed, 28 insertions(+), 85 deletions(-) diff --git a/vhdbuilder/prefetch/scripts/optimize.sh b/vhdbuilder/prefetch/scripts/optimize.sh index 25706588f7c..ee2d37150cb 100755 --- a/vhdbuilder/prefetch/scripts/optimize.sh +++ b/vhdbuilder/prefetch/scripts/optimize.sh @@ -28,16 +28,16 @@ CAPTURED_SIG_VERSION_ID="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${SIG_ IMAGE_BUILDER_RG_NAME="image-builder-${CAPTURED_SIG_VERSION}-${BUILD_RUN_NUMBER}" IMAGE_BUILDER_TEMPLATE_NAME="template-${CAPTURED_SIG_VERSION}-${BUILD_RUN_NUMBER}" IMAGE_BUILDER_TEMPLATE_NAME="${IMAGE_BUILDER_TEMPLATE_NAME:0:64}" +IMAGE_BUILDER_TEMPLATE_ID="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${IMAGE_BUILDER_RG_NAME}/providers/Microsoft.VirtualMachineImages/imageTemplates/${IMAGE_BUILDER_TEMPLATE_NAME}" VHD_URI="${STORAGE_ACCOUNT_BLOB_URL}/${VHD_NAME}" -# the image builder template distributes the prefetch-optimized image as a managed image into the -# temporary image builder resource group. 
we then convert that managed image into a VHD blob within -# the target storage account ourselves, which avoids the slow multi-copy VHD distribution performed -# by image builder when distributing directly to a VHD blob. -DISTRIBUTE_MANAGED_IMAGE_NAME="${CAPTURED_SIG_VERSION}-optimized" -DISTRIBUTE_MANAGED_IMAGE_ID="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${IMAGE_BUILDER_RG_NAME}/providers/Microsoft.Compute/images/${DISTRIBUTE_MANAGED_IMAGE_NAME}" -OPTIMIZED_DISK_NAME="${CAPTURED_SIG_VERSION}-optimized" -OPTIMIZED_DISK_ID="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${IMAGE_BUILDER_RG_NAME}/providers/Microsoft.Compute/disks/${OPTIMIZED_DISK_NAME}" +# the image builder template distributes the prefetch-optimized image as a VHD without specifying a +# target uri, which causes image builder to publish the VHD to a storage account within its own +# staging resource group. we then copy that staging VHD directly into the target storage account +# ourselves via azcopy, which avoids the slow second copy image builder performs when distributing +# directly to an external target storage account. the name of the run output is used to look up the +# resulting blob (artifactUri) after the template run completes. +DISTRIBUTE_RUN_OUTPUT_NAME="VHD" main() { # for idempotency, check to see if the VHD we're trying to create already exists. @@ -46,11 +46,11 @@ main() { # optimization + conversion flow check_for_existing_vhd - # attempt to perform prefetch optimization, distributing the optimized image as a managed image, - # then convert that managed image into a VHD blob within the target storage account + # attempt to perform prefetch optimization, distributing the optimized image as a VHD into the + # image builder staging storage account, then copy that VHD into the target storage account ensure_image_builder_rg || exit $? run_image_builder_template || exit $? - convert_managed_image_to_vhd || exit $? + copy_optimized_vhd || exit $? 
} check_for_existing_vhd() { @@ -91,7 +91,6 @@ run_image_builder_template() { -e "s##${SOURCE_TYPE}#g" \ -e "s##${SOURCE_ID_KEY}#g" \ -e "s##${SOURCE_ID}#g" \ - -e "s##${DISTRIBUTE_MANAGED_IMAGE_ID}#g" \ "${IMAGE_BUILDER_TEMPLATE_PATH}" > input.json || return $? if [ ! -f "input.json" ]; then @@ -125,7 +124,7 @@ run_image_builder_template() { return 1 fi - echo "template ${IMAGE_BUILDER_TEMPLATE_NAME} has ran to completion, optimized managed image has been published to: ${DISTRIBUTE_MANAGED_IMAGE_ID}" + echo "template ${IMAGE_BUILDER_TEMPLATE_NAME} has ran to completion, optimized VHD has been published to image builder staging storage" } need_new_template() { @@ -274,82 +273,28 @@ create_temp_storage() { TEMP_VHD_URI="${temp_vhd_uri}" } -wait_for_managed_image() { - # the managed image distributed by the image builder template should exist once the run succeeds, - # but the underlying ARM resource can take a brief moment to become visible after the run completes. - managed_image_id="" - attempts=0 - while [ "${attempts}" -lt 10 ]; do - managed_image_id="$(az image show -g "${IMAGE_BUILDER_RG_NAME}" -n "${DISTRIBUTE_MANAGED_IMAGE_NAME}" 2>/dev/null | jq -r '.id')" - if [ -n "${managed_image_id}" ] && [ "${managed_image_id,,}" != "null" ]; then - echo "found optimized managed image: ${managed_image_id}" - return 0 - fi - echo "optimized managed image ${DISTRIBUTE_MANAGED_IMAGE_NAME} not yet visible in ${IMAGE_BUILDER_RG_NAME}, will wait 30s before checking again" - sleep 30s - attempts=$((attempts + 1)) - done - echo "expected optimized managed image ${DISTRIBUTE_MANAGED_IMAGE_NAME} does not exist in ${IMAGE_BUILDER_RG_NAME}" - return 1 -} - -# convert_managed_image_to_vhd converts the prefetch-optimized managed image produced by the image -# builder template into a VHD blob within the target storage account. This replaces distributing a -# VHD directly from the image builder template, which performed multiple slow blob copies. The steps: -# 1. 
Create a managed disk in the image builder resource group from the optimized managed image -# 2. Grant temporary read (SAS) access to the disk -# 3. azcopy the disk directly into the target VHD blob in a single copy operation -# 4. Mark the VHD blob complete so retries of this step are idempotent -convert_managed_image_to_vhd() { - wait_for_managed_image || return $? - - if [ -z "$(az disk show --ids "${OPTIMIZED_DISK_ID}" | jq -r '.id')" ]; then - # CVM and TrustedLaunch images must carry their security type onto the managed disk, otherwise - # the disk cannot be created from the optimized managed image. Standard images get no security profile. - security_profile="" - if [ "${ENABLE_TRUSTED_LAUNCH,,}" = "true" ]; then - echo "optimized image is a TrustedLaunch flavor, will create managed disk with TrustedLaunch security type" - security_profile="\"securityProfile\": { \"securityType\": \"TrustedLaunch\" }, " - elif grep -q "cvm" <<< "$FEATURE_FLAGS"; then - echo "optimized image is a CVM flavor, will create managed disk with ConfidentialVM security type" - security_profile="\"securityProfile\": { \"securityType\": \"ConfidentialVM_VMGuestStateOnlyEncryptedWithPlatformKey\" }, " - fi - echo "creating managed disk ${OPTIMIZED_DISK_NAME} from optimized managed image ${DISTRIBUTE_MANAGED_IMAGE_ID}" - az resource create --id "${OPTIMIZED_DISK_ID}" --api-version "${MANAGED_DISK_API_VERSION}" --is-full-object --location "$LOCATION" --properties "{\"location\": \"$LOCATION\", \ - \"properties\": { \ - \"osType\": \"Linux\", \ - ${security_profile}\ - \"creationData\": { \ - \"createOption\": \"FromImage\", \ - \"imageReference\": { \ - \"id\": \"${DISTRIBUTE_MANAGED_IMAGE_ID}\" \ - } \ - } \ - } \ - }" || return $? 
- echo "created managed disk ${OPTIMIZED_DISK_ID} from optimized managed image" - fi - - set +x - # revoke any lingering access from a prior interrupted run so grant-access does not fail - az disk revoke-access --ids "${OPTIMIZED_DISK_ID}" >/dev/null 2>&1 || true - disk_sas_url=$(az disk grant-access --ids "${OPTIMIZED_DISK_ID}" --duration-in-seconds 1800 | jq -r '.accessSAS') - if [ -z "${disk_sas_url}" ] || [ "${disk_sas_url,,}" = "null" ] || [ "${disk_sas_url,,}" = "none" ]; then - echo "generated SAS URL for optimized managed disk is empty, cannot continue" +# copy_optimized_vhd copies the prefetch-optimized VHD that the image builder template published to its +# staging storage account directly into the target storage account. Because the template distributes a +# VHD without specifying a target uri, image builder publishes the blob to a storage account within its +# own staging resource group and exposes its location via the run output's artifactUri. We copy that +# blob into the target storage account ourselves with a single azcopy, then mark it complete so retries +# of this step are idempotent. 
+copy_optimized_vhd() { + artifact_uri="$(az resource show --ids "${IMAGE_BUILDER_TEMPLATE_ID}/runOutputs/${DISTRIBUTE_RUN_OUTPUT_NAME}" --api-version "${IMAGE_BUILDER_API_VERSION}" --query "properties.artifactUri" -o tsv 2>/dev/null)" + if [ -z "${artifact_uri}" ] || [ "${artifact_uri,,}" = "null" ] || [ "${artifact_uri,,}" = "none" ]; then + set -x + echo "unable to determine artifactUri for run output ${DISTRIBUTE_RUN_OUTPUT_NAME}, cannot continue" return 1 fi echo "setting azcopy environment variables with pool identity: ${IMAGE_BUILDER_IDENTITY_ID}" export AZCOPY_AUTO_LOGIN_TYPE="AZCLI" export AZCOPY_CONCURRENCY_VALUE="AUTO" - echo "copying optimized disk ${OPTIMIZED_DISK_ID} to ${VHD_URI}" - azcopy copy "${disk_sas_url}" "${VHD_URI}" --recursive=true + echo "copying optimized VHD from image builder staging storage to ${VHD_URI}" + azcopy copy "${artifact_uri}" "${VHD_URI}" --recursive=true azcopy_exit_code=$? - set -x - - az disk revoke-access --ids "${OPTIMIZED_DISK_ID}" || echo "unable to revoke access to ${OPTIMIZED_DISK_ID}, will proceed" if [ "${azcopy_exit_code}" -ne 0 ]; then - echo "failed to copy optimized disk ${OPTIMIZED_DISK_ID} to ${VHD_URI}" + echo "failed to copy optimized VHD to ${VHD_URI}" return "${azcopy_exit_code}" fi diff --git a/vhdbuilder/prefetch/templates/optimize.json b/vhdbuilder/prefetch/templates/optimize.json index 3d5c453cab4..a12df56966c 100644 --- a/vhdbuilder/prefetch/templates/optimize.json +++ b/vhdbuilder/prefetch/templates/optimize.json @@ -33,10 +33,8 @@ }, "distribute": [ { - "type": "ManagedImage", - "imageId": "", - "location": "", - "runOutputName": "ManagedImage" + "type": "VHD", + "runOutputName": "VHD" } ] } From 15b8eb575bae1b56ed73e9cdb86d29c2589cf053 Mon Sep 17 00:00:00 2001 From: cameronmeissner Date: Wed, 1 Jul 2026 08:39:50 -0700 Subject: [PATCH 3/7] chore: account for immutable vhd container --- .../templates/.builder-release-template.yaml | 1 + vhdbuilder/prefetch/scripts/optimize.sh | 88 
++++++++++++++----- 2 files changed, 67 insertions(+), 22 deletions(-) diff --git a/.pipelines/templates/.builder-release-template.yaml b/.pipelines/templates/.builder-release-template.yaml index 3f8e59bb583..955fe11acef 100644 --- a/.pipelines/templates/.builder-release-template.yaml +++ b/.pipelines/templates/.builder-release-template.yaml @@ -295,6 +295,7 @@ steps: SIG_IMAGE_NAME: $(SIG_IMAGE_NAME) SKU_NAME: $(SKU_NAME) STORAGE_ACCOUNT_BLOB_URL: $(CLASSIC_BLOB) + STORAGE_ACCOUNT_BLOB_URL_STAGING: $(CLASSIC_BLOB_STAGING) VHD_NAME: $(VHD_NAME) IMAGE_BUILDER_IDENTITY_ID: $(AZURE_MSI_RESOURCE_STRING) BUILD_RUN_NUMBER: $(Build.BuildNumber) diff --git a/vhdbuilder/prefetch/scripts/optimize.sh b/vhdbuilder/prefetch/scripts/optimize.sh index ee2d37150cb..e483fda0f93 100755 --- a/vhdbuilder/prefetch/scripts/optimize.sh +++ b/vhdbuilder/prefetch/scripts/optimize.sh @@ -11,6 +11,7 @@ set -uxo pipefail [ -z "${SIG_IMAGE_NAME:-}" ] && echo "SIG_IMAGE_NAME is not set" && exit 1 [ -z "${SKU_NAME:-}" ] && echo "SKU_NAME is not set" && exit 1 [ -z "${STORAGE_ACCOUNT_BLOB_URL:-}" ] && echo "STORAGE_ACCOUNT_BLOB_URL is not set" && exit 1 +[ -z "${STORAGE_ACCOUNT_BLOB_URL_STAGING:-}" ] && echo "STORAGE_ACCOUNT_BLOB_URL_STAGING is not set" && exit 1 [ -z "${VHD_NAME:-}" ] && echo "VHD_NAME is not set" && exit 1 [ -z "${IMAGE_BUILDER_IDENTITY_ID:-}" ] && echo "IMAGE_BUILDER_IDENTITY_ID is not set" && exit 1 [ -z "${BUILD_RUN_NUMBER:-}" ] && echo "BUILD_RUN_NUMBER is not set" && exit 1 @@ -33,12 +34,23 @@ VHD_URI="${STORAGE_ACCOUNT_BLOB_URL}/${VHD_NAME}" # the image builder template distributes the prefetch-optimized image as a VHD without specifying a # target uri, which causes image builder to publish the VHD to a storage account within its own -# staging resource group. 
we then copy that staging VHD directly into the target storage account -# ourselves via azcopy, which avoids the slow second copy image builder performs when distributing -# directly to an external target storage account. the name of the run output is used to look up the -# resulting blob (artifactUri) after the template run completes. +# staging resource group. we then copy that staging VHD into the target storage account ourselves, +# which avoids the slow second copy image builder performs when distributing directly to an external +# target storage account. the name of the run output is used to look up the resulting blob +# (artifactUri) after the template run completes. DISTRIBUTE_RUN_OUTPUT_NAME="VHD" +# the destination vhd container may have an immutability policy, which prevents azcopy from writing the VHD +# into it directly (azcopy creates the page blob and then writes to it, which the policy rejects as a +# modification of an existing blob). To work around this, we azcopy the VHD into the staging container +# (STORAGE_ACCOUNT_BLOB_URL_STAGING, which has no immutability policy), then perform a single server-side +# blob copy from the staging container into the immutable vhd container. The server-side copy creates the +# destination blob in one atomic operation, which the immutability policy allows. +DESTINATION_STORAGE_ACCOUNT_NAME="${STORAGE_ACCOUNT_BLOB_URL#*://}" +DESTINATION_STORAGE_ACCOUNT_NAME="${DESTINATION_STORAGE_ACCOUNT_NAME%%.*}" +DESTINATION_CONTAINER_NAME="${STORAGE_ACCOUNT_BLOB_URL##*/}" +STAGING_VHD_URI="${STORAGE_ACCOUNT_BLOB_URL_STAGING}/${VHD_NAME}" + main() { # for idempotency, check to see if the VHD we're trying to create already exists. # if it's already in the expected state, this will cause the script to exit early. 
@@ -61,15 +73,24 @@ check_for_existing_vhd() { fi echo "VHD already exists at: ${VHD_URI}" - # we mark the VHD with the prefetchOptimized metadata flag only after the optimized managed image - # has been fully copied into the target storage account. if the flag is present, the VHD is complete - # and there is nothing left to do. otherwise, the blob is left over from an incomplete run and must - # be deleted before retrying the optimization + conversion flow. - if [ "$(jq -r '.metadata.prefetchOptimized' <<< "${vhd_info}")" = "true" ]; then - echo "VHD ${VHD_URI} was fully produced by a previous prefetch optimization run, nothing to do" + # the VHD is produced by a server-side blob copy from the staging container, so its copy status tells + # us whether a previous run already completed. A successful copy means there is nothing to do; a + # pending copy just needs to be waited on. Any other state means the blob is in a bad state and must + # be recreated. + copy_status="$(jq -r '.properties.copy.status' <<< "${vhd_info}")" + if [ "${copy_status,,}" = "success" ]; then + echo "VHD ${VHD_URI} was already produced by a previous prefetch optimization run, nothing to do" exit 0 fi - echo "VHD ${VHD_URI} exists but was not produced by a prefetch optimization run, will delete before proceeding" + if [ "${copy_status,,}" = "pending" ]; then + echo "VHD ${VHD_URI} is currently being copied by a previous prefetch optimization run, will wait for completion" + if wait_for_vhd_copy; then + exit 0 + fi + echo "pending copy over ${VHD_URI} did not complete successfully, will delete existing blob and retry" + else + echo "VHD ${VHD_URI} exists in an unexpected copy state: '${copy_status}', will delete before proceeding" + fi delete_vhd || exit $? } @@ -274,36 +295,59 @@ create_temp_storage() { } # copy_optimized_vhd copies the prefetch-optimized VHD that the image builder template published to its -# staging storage account directly into the target storage account. 
Because the template distributes a -# VHD without specifying a target uri, image builder publishes the blob to a storage account within its -# own staging resource group and exposes its location via the run output's artifactUri. We copy that -# blob into the target storage account ourselves with a single azcopy, then mark it complete so retries -# of this step are idempotent. +# staging storage account into the target storage account. Because the template distributes a VHD +# without specifying a target uri, image builder publishes the blob to a storage account within its own +# staging resource group and exposes its location via the run output's artifactUri. The destination vhd +# container may have an immutability policy that blocks azcopy from writing to it directly, so we: +# 1. azcopy the optimized VHD into the staging container (STORAGE_ACCOUNT_BLOB_URL_STAGING) +# 2. perform a single server-side blob copy from the staging container into the immutable vhd container copy_optimized_vhd() { artifact_uri="$(az resource show --ids "${IMAGE_BUILDER_TEMPLATE_ID}/runOutputs/${DISTRIBUTE_RUN_OUTPUT_NAME}" --api-version "${IMAGE_BUILDER_API_VERSION}" --query "properties.artifactUri" -o tsv 2>/dev/null)" if [ -z "${artifact_uri}" ] || [ "${artifact_uri,,}" = "null" ] || [ "${artifact_uri,,}" = "none" ]; then - set -x echo "unable to determine artifactUri for run output ${DISTRIBUTE_RUN_OUTPUT_NAME}, cannot continue" return 1 fi echo "setting azcopy environment variables with pool identity: ${IMAGE_BUILDER_IDENTITY_ID}" export AZCOPY_AUTO_LOGIN_TYPE="AZCLI" export AZCOPY_CONCURRENCY_VALUE="AUTO" - echo "copying optimized VHD from image builder staging storage to ${VHD_URI}" - azcopy copy "${artifact_uri}" "${VHD_URI}" --recursive=true + echo "copying optimized VHD from image builder staging storage to ${STAGING_VHD_URI}" + azcopy copy "${artifact_uri}" "${STAGING_VHD_URI}" --recursive=true azcopy_exit_code=$? 
if [ "${azcopy_exit_code}" -ne 0 ]; then - echo "failed to copy optimized VHD to ${VHD_URI}" + echo "failed to copy optimized VHD to staging location ${STAGING_VHD_URI}" return "${azcopy_exit_code}" fi - # mark the VHD as fully produced by prefetch optimization so retries of this step exit early - az storage blob metadata update --blob-url "${VHD_URI}" --auth-mode login --metadata prefetchOptimized=true || return $? + # server-side copy the VHD from the staging container into the immutable vhd container. This creates + # the destination blob in a single operation, which is permitted by the container's immutability policy. + echo "starting server-side copy of ${STAGING_VHD_URI} to ${VHD_URI}" + az storage blob copy start \ + --account-name "${DESTINATION_STORAGE_ACCOUNT_NAME}" \ + --auth-mode login \ + --destination-container "${DESTINATION_CONTAINER_NAME}" \ + --destination-blob "${VHD_NAME}" \ + --source-uri "${STAGING_VHD_URI}" || return $? + + wait_for_vhd_copy || return $? echo "optimized VHD has been published to: ${VHD_URI}" } +wait_for_vhd_copy() { + copy_status="$(az storage blob show --blob-url "${VHD_URI}" --auth-mode login | jq -r '.properties.copy.status')" + while [ "${copy_status,,}" = "pending" ]; do + echo "server-side copy to ${VHD_URI} is still pending, will wait 30s before checking again" + sleep 30s + copy_status="$(az storage blob show --blob-url "${VHD_URI}" --auth-mode login | jq -r '.properties.copy.status')" + done + if [ "${copy_status,,}" != "success" ]; then + echo "server-side copy to ${VHD_URI} finished with unexpected status: ${copy_status}" + return 1 + fi + echo "server-side copy to ${VHD_URI} completed successfully" +} + delete_vhd() { az storage blob delete --blob-url "${VHD_URI}" --auth-mode login || return $? 
while [ -n "$(az storage blob show --blob-url "${VHD_URI}" --auth-mode login | jq -r '.name')" ]; do From 0c6bceecce404ccf651c29caf441af3778c5e577 Mon Sep 17 00:00:00 2001 From: cameronmeissner Date: Wed, 1 Jul 2026 13:33:06 -0700 Subject: [PATCH 4/7] chore: address feedback and improve idempotency --- vhdbuilder/prefetch/scripts/optimize.sh | 88 +++++++++++++++---------- 1 file changed, 52 insertions(+), 36 deletions(-) diff --git a/vhdbuilder/prefetch/scripts/optimize.sh b/vhdbuilder/prefetch/scripts/optimize.sh index e483fda0f93..e9449fc7201 100755 --- a/vhdbuilder/prefetch/scripts/optimize.sh +++ b/vhdbuilder/prefetch/scripts/optimize.sh @@ -52,10 +52,10 @@ DESTINATION_CONTAINER_NAME="${STORAGE_ACCOUNT_BLOB_URL##*/}" STAGING_VHD_URI="${STORAGE_ACCOUNT_BLOB_URL_STAGING}/${VHD_NAME}" main() { - # for idempotency, check to see if the VHD we're trying to create already exists. - # if it's already in the expected state, this will cause the script to exit early. - # otherwise, we delete any existing VHD in an unexpected state and retry the whole - # optimization + conversion flow + # for idempotency, check whether the target VHD already exists. If a previous run already produced it + # successfully, exit early; if a server-side copy is still in progress, wait for it to finish. Because + # the destination container may be immutable, a VHD left in a bad state cannot be deleted and recreated + # and is treated as fatal. 
check_for_existing_vhd # attempt to perform prefetch optimization, distributing the optimized image as a VHD into the @@ -66,18 +66,16 @@ main() { } check_for_existing_vhd() { - vhd_info="$(az storage blob show --blob-url "${VHD_URI}" --auth-mode login)" - if [ -z "${vhd_info}" ]; then + if [ "$(az storage blob exists --blob-url "${VHD_URI}" --auth-mode login | jq -r '.exists')" = "false" ]; then echo "no existing VHD was found at: ${VHD_URI}, will proceed with optimization and VHD creation" return 0 fi echo "VHD already exists at: ${VHD_URI}" - # the VHD is produced by a server-side blob copy from the staging container, so its copy status tells - # us whether a previous run already completed. A successful copy means there is nothing to do; a - # pending copy just needs to be waited on. Any other state means the blob is in a bad state and must - # be recreated. - copy_status="$(jq -r '.properties.copy.status' <<< "${vhd_info}")" + # the VHD in a potentially immutable container is produced by a server-side blob copy, so its copy status + # tells us whether a previous run already completed. Because the container may have an immutability policy, + # a VHD in a bad state cannot be deleted and recreated, so any non-success terminal state is fatal. + copy_status="$(az storage blob show --blob-url "${VHD_URI}" --auth-mode login | jq -r '.properties.copy.status')" if [ "${copy_status,,}" = "success" ]; then echo "VHD ${VHD_URI} was already produced by a previous prefetch optimization run, nothing to do" exit 0 @@ -87,11 +85,11 @@ check_for_existing_vhd() { if wait_for_vhd_copy; then exit 0 fi - echo "pending copy over ${VHD_URI} did not complete successfully, will delete existing blob and retry" - else - echo "VHD ${VHD_URI} exists in an unexpected copy state: '${copy_status}', will delete before proceeding" fi - delete_vhd || exit $? + # the VHD exists in a non-success state within the immutable container. 
It cannot be deleted due to + # the immutability policy, so there is nothing we can do to recover - fail the script. + echo "VHD ${VHD_URI} exists in an unrecoverable copy state: '${copy_status}'" + exit 1 } ensure_image_builder_rg() { @@ -197,7 +195,7 @@ prepare_source() { # 4. Create a new managed image in the build location from the temporary VHD blob, which will be used as the source image of the image builder template convert_specialized_sig_version_to_managed_image() { managed_image_name="${CAPTURED_SIG_VERSION}-template-source" - managed_image_id="$(az image show -g "${IMAGE_BUILDER_RG_NAME}" -n "${managed_image_name}" | jq -r '.id')" + managed_image_id="$(az image show -g "${IMAGE_BUILDER_RG_NAME}" -n "${managed_image_name}" | jq -r '.id // empty')" if [ -n "${managed_image_id}" ]; then echo "managed image source already exists: ${managed_image_id}" SOURCE_MANAGED_IMAGE_ID="${managed_image_id}" @@ -288,7 +286,6 @@ create_temp_storage() { echo "creating container \"${storage_container_name}\" within temporary storage account ${storage_account_name}" az storage container create --name "${storage_container_name}" --account-name "${storage_account_name}" --auth-mode login || return $? fi - temp_vhd_uri="https://${storage_account_name}.blob.core.windows.net/${storage_container_name}/${VHD_NAME}" echo "temp VHD URI is ${temp_vhd_uri}" TEMP_VHD_URI="${temp_vhd_uri}" @@ -302,21 +299,40 @@ create_temp_storage() { # 1. azcopy the optimized VHD into the staging container (STORAGE_ACCOUNT_BLOB_URL_STAGING) # 2. 
perform a single server-side blob copy from the staging container into the immutable vhd container copy_optimized_vhd() { - artifact_uri="$(az resource show --ids "${IMAGE_BUILDER_TEMPLATE_ID}/runOutputs/${DISTRIBUTE_RUN_OUTPUT_NAME}" --api-version "${IMAGE_BUILDER_API_VERSION}" --query "properties.artifactUri" -o tsv 2>/dev/null)" - if [ -z "${artifact_uri}" ] || [ "${artifact_uri,,}" = "null" ] || [ "${artifact_uri,,}" = "none" ]; then - echo "unable to determine artifactUri for run output ${DISTRIBUTE_RUN_OUTPUT_NAME}, cannot continue" - return 1 - fi - echo "setting azcopy environment variables with pool identity: ${IMAGE_BUILDER_IDENTITY_ID}" - export AZCOPY_AUTO_LOGIN_TYPE="AZCLI" - export AZCOPY_CONCURRENCY_VALUE="AUTO" - echo "copying optimized VHD from image builder staging storage to ${STAGING_VHD_URI}" - azcopy copy "${artifact_uri}" "${STAGING_VHD_URI}" --recursive=true - azcopy_exit_code=$? + # The staging container should never have an immutability policy, so copying from image builder staging storage is + # fully retriable. If a previous run already produced a complete staging blob (marked with the + # prefetched metadata flag) but failed before finishing the server-side copy into the + # immutable vhd container, reuse that staging blob and skip re-copying. An incomplete staging blob + # left over from an interrupted copy is deleted and recreated. 
+ staging_info="$(az storage blob show --blob-url "${STAGING_VHD_URI}" --auth-mode login 2>/dev/null)" + staging_blob_name="$(jq -r '.name // empty' <<< "${staging_info}" 2>/dev/null)" + if [ -n "${staging_blob_name}" ] && [ "$(jq -r '.metadata.prefetched' <<< "${staging_info}")" = "true" ]; then + echo "optimized VHD is already present and complete in staging at ${STAGING_VHD_URI}, skipping copy from image builder staging storage" + else + if [ -n "${staging_blob_name}" ]; then + echo "found an incomplete staging VHD at ${STAGING_VHD_URI} from a previous run, will delete it before recopying" + delete_staging_vhd || return $? + fi + + artifact_uri="$(az resource show --ids "${IMAGE_BUILDER_TEMPLATE_ID}/runOutputs/${DISTRIBUTE_RUN_OUTPUT_NAME}" --api-version "${IMAGE_BUILDER_API_VERSION}" --query "properties.artifactUri" -o tsv 2>/dev/null)" + if [ -z "${artifact_uri}" ] || [ "${artifact_uri,,}" = "null" ] || [ "${artifact_uri,,}" = "none" ]; then + echo "unable to determine artifactUri for run output ${DISTRIBUTE_RUN_OUTPUT_NAME}, cannot continue" + return 1 + fi + echo "setting azcopy environment variables with pool identity: ${IMAGE_BUILDER_IDENTITY_ID}" + export AZCOPY_AUTO_LOGIN_TYPE="AZCLI" + export AZCOPY_CONCURRENCY_VALUE="AUTO" + echo "copying optimized VHD from image builder staging storage to ${STAGING_VHD_URI}" + azcopy copy "${artifact_uri}" "${STAGING_VHD_URI}" --recursive=true + azcopy_exit_code=$? + if [ "${azcopy_exit_code}" -ne 0 ]; then + echo "failed to copy optimized VHD to staging location ${STAGING_VHD_URI}" + return "${azcopy_exit_code}" + fi - if [ "${azcopy_exit_code}" -ne 0 ]; then - echo "failed to copy optimized VHD to staging location ${STAGING_VHD_URI}" - return "${azcopy_exit_code}" + # mark the staging blob complete so that, if this run fails before finishing the server-side copy, + # a later retry can detect it and skip re-copying from image builder staging storage. 
+ az storage blob metadata update --blob-url "${STAGING_VHD_URI}" --auth-mode login --metadata prefetched=true || return $? fi # server-side copy the VHD from the staging container into the immutable vhd container. This creates @@ -348,13 +364,13 @@ wait_for_vhd_copy() { echo "server-side copy to ${VHD_URI} completed successfully" } -delete_vhd() { - az storage blob delete --blob-url "${VHD_URI}" --auth-mode login || return $? - while [ -n "$(az storage blob show --blob-url "${VHD_URI}" --auth-mode login | jq -r '.name')" ]; do - echo "VHD ${VHD_URI} has yet to be deleted, will wait 30s before checking again" +delete_staging_vhd() { + az storage blob delete --blob-url "${STAGING_VHD_URI}" --auth-mode login || return $? + while [ -n "$(az storage blob show --blob-url "${STAGING_VHD_URI}" --auth-mode login 2>/dev/null | jq -r '.name // empty')" ]; do + echo "staging VHD ${STAGING_VHD_URI} has yet to be deleted, will wait 30s before checking again" sleep 30s done - echo "${VHD_URI} has been deleted" + echo "${STAGING_VHD_URI} has been deleted" } main "$@" From 30f36e0de39b0a773e34b888f6f2728815d9b4e0 Mon Sep 17 00:00:00 2001 From: cameronmeissner Date: Wed, 1 Jul 2026 14:23:21 -0700 Subject: [PATCH 5/7] chore: augment retry logic to handle AIB rate limits --- .../templates/.builder-release-template.yaml | 2 +- vhdbuilder/prefetch/scripts/optimize.sh | 42 +++++++++++++++++-- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/.pipelines/templates/.builder-release-template.yaml b/.pipelines/templates/.builder-release-template.yaml index 955fe11acef..8a0ee5d92e1 100644 --- a/.pipelines/templates/.builder-release-template.yaml +++ b/.pipelines/templates/.builder-release-template.yaml @@ -286,7 +286,7 @@ steps: eq(variables.PREFETCH_COMPATIBLE, 'True') ) displayName: Run Prefetch Optimization and Convert to VHD blob - retryCountOnTaskFailure: 10 + retryCountOnTaskFailure: 3 env: SUBSCRIPTION_ID: $(SUBSCRIPTION_ID) LOCATION: $(PACKER_BUILD_LOCATION) diff --git 
a/vhdbuilder/prefetch/scripts/optimize.sh b/vhdbuilder/prefetch/scripts/optimize.sh index e9449fc7201..54eb667ea47 100755 --- a/vhdbuilder/prefetch/scripts/optimize.sh +++ b/vhdbuilder/prefetch/scripts/optimize.sh @@ -24,6 +24,12 @@ set -uxo pipefail IMAGE_BUILDER_API_VERSION="2025-10-01" MANAGED_DISK_API_VERSION="2024-03-02" +# image builder rejects new template creations and runs when too many tasks from a single subscription +# are running or waiting (TooManyRequests). This limit clears slowly, so we back off heavily and retry a +# large number of times before giving up. +RATE_LIMIT_MAX_ATTEMPTS=15 +RATE_LIMIT_RETRY_DELAY_SECONDS=180 + IMAGE_BUILDER_TEMPLATE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)/../templates/optimize.json" CAPTURED_SIG_VERSION_ID="/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${SIG_GALLERY_RESOURCE_GROUP_NAME}/providers/Microsoft.Compute/galleries/${SIG_GALLERY_NAME}/images/${SIG_IMAGE_NAME}/versions/${CAPTURED_SIG_VERSION}" IMAGE_BUILDER_RG_NAME="image-builder-${CAPTURED_SIG_VERSION}-${BUILD_RUN_NUMBER}" @@ -118,7 +124,7 @@ run_image_builder_template() { fi echo "creating image builder template ${IMAGE_BUILDER_TEMPLATE_NAME} in resource group ${IMAGE_BUILDER_RG_NAME}" - az resource create -n "${IMAGE_BUILDER_TEMPLATE_NAME}" \ + retry_on_rate_limit az resource create -n "${IMAGE_BUILDER_TEMPLATE_NAME}" \ --properties @input.json \ --is-full-object \ --api-version "${IMAGE_BUILDER_API_VERSION}" \ @@ -126,11 +132,11 @@ run_image_builder_template() { --resource-group "${IMAGE_BUILDER_RG_NAME}" || return $? echo "image builder template ${IMAGE_BUILDER_TEMPLATE_NAME} has been created, starting run..." 
- az image builder run -n "${IMAGE_BUILDER_TEMPLATE_NAME}" -g "${IMAGE_BUILDER_RG_NAME}" + retry_on_rate_limit az image builder run -n "${IMAGE_BUILDER_TEMPLATE_NAME}" -g "${IMAGE_BUILDER_RG_NAME}" else if [ "$(az image builder show -n "${IMAGE_BUILDER_TEMPLATE_NAME}" -g "${IMAGE_BUILDER_RG_NAME}" | jq -r '.lastRunStatus')" = "null" ]; then echo "template ${IMAGE_BUILDER_TEMPLATE_NAME} has no lastRunStatus, will attempt to run..." - az image builder run -n "${IMAGE_BUILDER_TEMPLATE_NAME}" -g "${IMAGE_BUILDER_RG_NAME}" + retry_on_rate_limit az image builder run -n "${IMAGE_BUILDER_TEMPLATE_NAME}" -g "${IMAGE_BUILDER_RG_NAME}" else echo "will attempt to wait for image builder template ${IMAGE_BUILDER_TEMPLATE_NAME} to finish its last run..." az image builder wait -n "${IMAGE_BUILDER_TEMPLATE_NAME}" -g "${IMAGE_BUILDER_RG_NAME}" --custom "lastRunStatus.runState!='Running'" @@ -373,4 +379,34 @@ delete_staging_vhd() { echo "${STAGING_VHD_URI} has been deleted" } +# retry_on_rate_limit runs the provided command, retrying only when it fails because image builder has +# rate limited us (TooManyRequests - image builder rejects new work when too many tasks from a single +# subscription are running or waiting). Because this limit clears slowly, we back off heavily between +# attempts. Any other (non rate limit) failure is returned immediately without retrying. +retry_on_rate_limit() { + local attempt=1 + local exit_code + local logfile + set +x + logfile="$(mktemp)" + while true; do + # stream the command output live via tee while also capturing it so we can inspect it for the + # rate limit error; PIPESTATUS[0] preserves the wrapped command's exit code (not tee's). + "$@" 2>&1 | tee "${logfile}" + exit_code="${PIPESTATUS[0]}" + if [ "${exit_code}" -eq 0 ]; then + rm -f "${logfile}" + return 0 + fi + if ! 
grep -qi "TooManyRequests" "${logfile}" || [ "${attempt}" -ge "${RATE_LIMIT_MAX_ATTEMPTS}" ]; then + rm -f "${logfile}" + return "${exit_code}" + fi + echo "command was rate limited by image builder (attempt ${attempt}/${RATE_LIMIT_MAX_ATTEMPTS}), waiting ${RATE_LIMIT_RETRY_DELAY_SECONDS}s before retrying..." + sleep "${RATE_LIMIT_RETRY_DELAY_SECONDS}" + attempt=$((attempt + 1)) + done + set -x +} + main "$@" From 8374fb38e03a901382a19f0efd9a815a1bc92ad4 Mon Sep 17 00:00:00 2001 From: cameronmeissner Date: Wed, 1 Jul 2026 14:24:33 -0700 Subject: [PATCH 6/7] chore: fix --- vhdbuilder/prefetch/scripts/optimize.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vhdbuilder/prefetch/scripts/optimize.sh b/vhdbuilder/prefetch/scripts/optimize.sh index 54eb667ea47..d9d45be2188 100755 --- a/vhdbuilder/prefetch/scripts/optimize.sh +++ b/vhdbuilder/prefetch/scripts/optimize.sh @@ -396,17 +396,18 @@ retry_on_rate_limit() { exit_code="${PIPESTATUS[0]}" if [ "${exit_code}" -eq 0 ]; then rm -f "${logfile}" + set -x return 0 fi if ! grep -qi "TooManyRequests" "${logfile}" || [ "${attempt}" -ge "${RATE_LIMIT_MAX_ATTEMPTS}" ]; then rm -f "${logfile}" + set -x return "${exit_code}" fi echo "command was rate limited by image builder (attempt ${attempt}/${RATE_LIMIT_MAX_ATTEMPTS}), waiting ${RATE_LIMIT_RETRY_DELAY_SECONDS}s before retrying..." 
sleep "${RATE_LIMIT_RETRY_DELAY_SECONDS}" attempt=$((attempt + 1)) done - set -x } main "$@" From 3a9ccad0e412e94914d071f287045f8cbf57a346 Mon Sep 17 00:00:00 2001 From: cameronmeissner Date: Wed, 1 Jul 2026 15:54:00 -0700 Subject: [PATCH 7/7] chore: delete staging blob after successful copy to immutable container --- vhdbuilder/prefetch/scripts/optimize.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vhdbuilder/prefetch/scripts/optimize.sh b/vhdbuilder/prefetch/scripts/optimize.sh index d9d45be2188..be120dc7529 100755 --- a/vhdbuilder/prefetch/scripts/optimize.sh +++ b/vhdbuilder/prefetch/scripts/optimize.sh @@ -353,6 +353,11 @@ copy_optimized_vhd() { wait_for_vhd_copy || return $? + # the optimized VHD is now safely in the immutable vhd container, so the staging copy is no longer + # needed - delete it as best-effort cleanup so staging blobs do not accumulate in the destination account. + # Use no-wait since we don't need to wait for the blob to be deleted in this case. + delete_staging_vhd true || echo "unable to delete staging VHD ${STAGING_VHD_URI}, will proceed" + echo "optimized VHD has been published to: ${VHD_URI}" } @@ -371,7 +376,13 @@ wait_for_vhd_copy() { } delete_staging_vhd() { + # when no_wait is true, issue the delete and return immediately without polling for the blob to disappear. + local no_wait="${1:-false}" az storage blob delete --blob-url "${STAGING_VHD_URI}" --auth-mode login || return $? + if [ "${no_wait,,}" = "true" ]; then + echo "issued delete for staging VHD ${STAGING_VHD_URI} without waiting for completion" + return 0 + fi while [ -n "$(az storage blob show --blob-url "${STAGING_VHD_URI}" --auth-mode login 2>/dev/null | jq -r '.name // empty')" ]; do echo "staging VHD ${STAGING_VHD_URI} has yet to be deleted, will wait 30s before checking again" sleep 30s