From a11e59ec14ea79d41f007821e5c652b06d350db8 Mon Sep 17 00:00:00 2001 From: Ganeshkumar Ashokavardhanan Date: Thu, 2 Jul 2026 15:34:34 -0700 Subject: [PATCH] test(e2e): serialize gallery image-version replication to fix GalleryImageNotFound Scenarios in different regions resolve the same freshly-built SIG image version in parallel (CachedPrepareVHD is keyed per image+region, so the regions run concurrently). Adding a region is a read-modify-write PUT of the version's TargetRegions, performed with no locking, so a stale PUT dropped a region another goroutine had just added -> the image became unavailable there and VMSS creation failed with GalleryImageNotFound. Observed on main: 20 failures, all "2404gen2containerd/1.1783016979.17372 is not available in southeastasia", after southeastasia and uaenorth replicated the same version concurrently. Serialize replication per image-version ID with an in-process mutex and re-read the live TargetRegions inside the lock before each update, so every replication builds on the current region set instead of clobbering concurrent additions. Distinct versions still replicate in parallel; only same-version region additions serialize. --- e2e/config/azure.go | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/e2e/config/azure.go b/e2e/config/azure.go index 66f6f96b360..e7c53d9624b 100644 --- a/e2e/config/azure.go +++ b/e2e/config/azure.go @@ -12,6 +12,7 @@ import ( "slices" "strconv" "strings" + "sync" "time" "github.com/Azure/agentbaker/e2e/toolkit" @@ -559,7 +560,49 @@ func (a *AzureClient) LatestSIGImageVersionByTag(ctx context.Context, image *Ima return VHDResourceID(*latestVersion.ID), nil } +// versionReplicationLocks serializes concurrent replications of the same gallery image version +// (keyed by version resource ID). Adding a target region is a read-modify-write of the version's +// TargetRegions, so concurrent updates from scenarios in different regions would otherwise clobber +// each other and leave the image unavailable in a dropped region (GalleryImageNotFound). +var versionReplicationLocks sync.Map + +func versionReplicationLock(versionID string) *sync.Mutex { + mu, _ := versionReplicationLocks.LoadOrStore(versionID, &sync.Mutex{}) + return mu.(*sync.Mutex) +} + +// refreshImageVersion re-reads the live gallery image version and updates version in place, so +// callers act on the current TargetRegions/ProvisioningState rather than a stale snapshot. +func (a *AzureClient) refreshImageVersion(ctx context.Context, image *Image, version *armcompute.GalleryImageVersion) error { + client, err := armcompute.NewGalleryImageVersionsClient(image.Gallery.SubscriptionID, a.Credential, a.ArmOptions) + if err != nil { + return fmt.Errorf("create image version client: %w", err) + } + resp, err := client.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, *version.Name, nil) + if err != nil { + return fmt.Errorf("get image version: %w", err) + } + *version = resp.GalleryImageVersion + return nil +} + func (a *AzureClient) ensureReplication(ctx context.Context, image *Image, version *armcompute.GalleryImageVersion, location string) error { + // Serialize replication of a given image version across concurrently-running scenarios. + // Adding a region is a read-modify-write PUT of the version's TargetRegions; scenarios in + // different regions resolve the same freshly-built version in parallel, so without + // serialization a stale PUT drops a region another goroutine just added and the image + // becomes unavailable there (GalleryImageNotFound). Lock per version ID and re-read the live + // TargetRegions inside the lock so every update builds on the current region set. + mu := versionReplicationLock(*version.ID) + mu.Lock() + defer mu.Unlock() + + // Re-read the live version under the lock: the snapshot passed in was captured (by List/Get) + // before the lock was acquired and may be missing regions added by another goroutine. + if err := a.refreshImageVersion(ctx, image, version); err != nil { + return fmt.Errorf("refreshing image version before replication: %w", err) + } + // Wait for any ongoing update operations to complete first if err := a.waitForVersionOperationCompletion(ctx, image, version); err != nil { return fmt.Errorf("waiting for version operation completion: %w", err)