Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions e2e/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,16 @@ const (
linuxExtensionExitCodeStr = `Enable failed: failed to execute command: command terminated with exit status=(\d+)`
)

// cseExitCodeOutboundConnFail is the CSE exit code for ERR_OUTBOUND_CONN_FAIL
// (see parts/linux/cloud-init/artifacts/cse_helpers.sh). It indicates the node's
// outbound connectivity preflight check (curl to mcr.microsoft.com, optionally routed
// through the e2e HTTP proxy) failed all retries and the script exited before kubelet
// started.
//
// In the e2e environment this is a known low-rate transient infrastructure flake
// rather than a product regression, so the harness retries node provisioning a
// bounded number of times to reduce PR-gate noise. A genuine regression fails on
// every attempt and still surfaces after the retry budget is exhausted.
//
// The value is a string (not an int) because it is compared directly against the
// ExitCode string parsed out of the Linux CSE status message.
const cseExitCodeOutboundConnFail = "50"

// test data used across multiple test cases
const (
encodedTestCert = "LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUgvVENDQmVXZ0F3SUJBZ0lRYUJZRTMvTTA4WEhZQ25OVm1jRkJjakFOQmdrcWhraUc5dzBCQVFzRkFEQnkKTVFzd0NRWURWUVFHRXdKVlV6RU9NQXdHQTFVRUNBd0ZWR1Y0WVhNeEVEQU9CZ05WQkFjTUIwaHZkWE4wYjI0eApFVEFQQmdOVkJBb01DRk5UVENCRGIzSndNUzR3TEFZRFZRUUREQ1ZUVTB3dVkyOXRJRVZXSUZOVFRDQkpiblJsCmNtMWxaR2xoZEdVZ1EwRWdVbE5CSUZJek1CNFhEVEl3TURRd01UQXdOVGd6TTFvWERUSXhNRGN4TmpBd05UZ3oKTTFvd2diMHhDekFKQmdOVkJBWVRBbFZUTVE0d0RBWURWUVFJREFWVVpYaGhjekVRTUE0R0ExVUVCd3dIU0c5MQpjM1J2YmpFUk1BOEdBMVVFQ2d3SVUxTk1JRU52Y25BeEZqQVVCZ05WQkFVVERVNVdNakF3T0RFMk1UUXlORE14CkZEQVNCZ05WQkFNTUMzZDNkeTV6YzJ3dVkyOXRNUjB3R3dZRFZRUVBEQlJRY21sMllYUmxJRTl5WjJGdWFYcGgKZEdsdmJqRVhNQlVHQ3lzR0FRUUJnamM4QWdFQ0RBWk9aWFpoWkdFeEV6QVJCZ3NyQmdFRUFZSTNQQUlCQXhNQwpWVk13Z2dFaU1BMEdDU3FHU0liM0RRRUJBUVVBQTRJQkR3QXdnZ0VLQW9JQkFRREhoZVJrYmIxRkNjN3hSS3N0CndLMEpJR2FLWTh0N0piUzJiUTJiNllJSkRnbkh1SVlIcUJyQ1VWNzlvZWxpa2tva1JrRnZjdnBhS2luRkhEUUgKVXBXRUk2UlVFUlltU0NnM084V2k0MnVPY1YyQjVaYWJtWENrd2R4WTVFY2w1MUJiTThVbkdkb0FHYmRObWlSbQpTbVRqY3MrbGhNeGc0ZkZZNmxCcGlFVkZpR1VqR1JSKzYxUjY3THo2VTRLSmVMTmNDbTA3UXdGWUtCbXBpMDhnCmR5Z1N2UmRVdzU1Sm9wcmVkaitWR3RqVWtCNGhGVDRHUVgvZ2h0NjlSbHF6Lys4dTBkRVFraHVVdXVjcnFhbG0KU0d5NDNIUndCZkRLRndZZVdNN0NQTWQ1ZS9kTyt0MDh0OFBianpWVFR2NWhRRENzRVlJVjJUN0FGSTlTY054TQpraDcvQWdNQkFBR2pnZ05CTUlJRFBUQWZCZ05WSFNNRUdEQVdnQlMvd1ZxSC95ajZRVDM5dDAva0hhK2dZVmdwCnZUQi9CZ2dyQmdFRkJRY0JBUVJ6TUhFd1RRWUlLd1lCQlFVSE1BS0dRV2gwZEhBNkx5OTNkM2N1YzNOc0xtTnYKYlM5eVpYQnZjMmwwYjNKNUwxTlRUR052YlMxVGRXSkRRUzFGVmkxVFUwd3RVbE5CTFRRd09UWXRVak11WTNKMApNQ0FHQ0NzR0FRVUZCekFCaGhSb2RIUndPaTh2YjJOemNITXVjM05zTG1OdmJUQWZCZ05WSFJFRUdEQVdnZ3QzCmQzY3VjM05zTG1OdmJZSUhjM05zTG1OdmJUQmZCZ05WSFNBRVdEQldNQWNHQldlQkRBRUJNQTBHQ3lxRWFBR0cKOW5jQ0JRRUJNRHdHRENzR0FRUUJncWt3QVFNQkJEQXNNQ29HQ0NzR0FRVUZCd0lCRmg1b2RIUndjem92TDNkMwpkeTV6YzJ3dVkyOXRMM0psY0c5emFYUnZjbmt3SFFZRFZSMGxCQll3RkFZSUt3WUJCUVVIQXdJR0NDc0dBUVVGCkJ3TUJNRWdHQTFVZEh3UkJNRDh3UGFBN29EbUdOMmgwZEhBNkx5OWpjbXh6TG5OemJDNWpiMjB2VTFOTVkyOXQKTFZOMVlrTkJMVVZXTFZOVFRDMVNVMEV0TkRBN
U5pMVNNeTVqY213d0hRWURWUjBPQkJZRUZBREFGVUlhenc1cgpaSUhhcG5SeElVbnB3K0dMTUE0R0ExVWREd0VCL3dRRUF3SUZvRENDQVgwR0Npc0dBUVFCMW5rQ0JBSUVnZ0Z0CkJJSUJhUUZuQUhjQTlseVVMOUYzTUNJVVZCZ0lNSlJXanVOTkV4a3p2OThNTHlBTHpFN3haT01BQUFGeE0waG8KYndBQUJBTUFTREJHQWlFQTZ4ZWxpTlI4R2svNjNwWWRuUy92T3gvQ2pwdEVNRXY4OVdXaDEvdXJXSUVDSVFEeQpCcmVIVTI1RHp3dWtRYVJRandXNjU1WkxrcUNueGJ4UVdSaU9lbWo5SkFCMUFKUWd2QjZPMVkxc2lITWZnb3NpCkxBM1IyazFlYkUrVVBXSGJUaTlZVGFMQ0FBQUJjVE5JYU53QUFBUURBRVl3UkFJZ0dSRTR3emFiTlJkRDhrcS8KdkZQM3RRZTJobTB4NW5YdWxvd2g0SWJ3M2xrQ0lGWWIvM2xTRHBsUzdBY1I0citYcFd0RUtTVEZXSm1OQ1JiYwpYSnVyMlJHQkFIVUE3c0NWN28xeVpBK1M0OE81RzhjU28ybHFDWHRMYWhvVU9PWkhzc3Z0eGZrQUFBRnhNMGhvCjh3QUFCQU1BUmpCRUFpQjZJdmJvV3NzM1I0SXRWd2plYmw3RDN5b0ZhWDBORGgyZFdoaGd3Q3hySHdJZ0NmcTcKb2NNQzV0KzFqaTVNNXhhTG1QQzRJK1dYM0kvQVJrV1N5aU83SVFjd0RRWUpLb1pJaHZjTkFRRUxCUUFEZ2dJQgpBQ2V1dXI0UW51anFtZ3VTckhVM21oZitjSm9kelRRTnFvNHRkZStQRDEvZUZkWUFFTHU4eEYrMEF0N3hKaVBZCmk1Ukt3aWx5UDU2diszaVkyVDlsdzdTOFRKMDQxVkxoYUlLcDE0TXpTVXpSeWVvT0FzSjdRQURNQ2xIS1VEbEgKVVUycE51bzg4WTZpZ292VDNic253Sk5pRVFOcXltU1NZaGt0dzB0YWR1b3FqcVhuMDZnc1Zpb1dUVkRYeXNkNQpxRXg0dDZzSWdJY01tMjZZSDF2SnBDUUVoS3BjMnkwN2dSa2tsQlpSdE1qVGh2NGNYeXlNWDd1VGNkVDdBSkJQCnVlaWZDb1YyNUp4WHVvOGQ1MTM5Z3dQMUJBZTdJQlZQeDJ1N0tOL1V5T1hkWm13TWYvVG1GR3dEZENmc3lIZi8KWnNCMndMSG96VFlvQVZtUTlGb1UxSkxnY1ZpdnFKK3ZObEJoSFhobHhNZE4wajgwUjlOejZFSWdsUWplSzNPOApJL2NGR20vQjgrNDJoT2xDSWQ5WmR0bmRKY1JKVmppMHdEMHF3ZXZDYWZBOWpKbEh2L2pzRStJOVV6NmNwQ3loCnN3K2xyRmR4VWdxVTU4YXhxZUs4OUZSK05vNHEwSUlPK0ppMXJKS3I5bmtTQjBCcVhvelZuRTFZQi9LTHZkSXMKdVlaSnVxYjJwS2t1K3p6VDZnVXdIVVRadkJpTk90WEw0Tnh3Yy9LVDdXek9TZDJ3UDEwUUk4REtnNHZmaU5EcwpIV21CMWM0S2ppNmdPZ0E1dVNVemFHbXEvdjRWbmNLNVVyK245TGJmbmZMYzI4SjVmdC9Hb3Rpbk15RGszaWFyCkYxMFlscWNPbWVYMXVGbUtiZGkvWG9yR2xrQ29NRjNURHg4cm1wOURCaUIvCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0=" //nolint:lll
Expand Down
89 changes: 88 additions & 1 deletion e2e/vmss.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,100 @@ func compileAKSNodeController(ctx context.Context, arch string) (*os.File, error
return f, nil
}

// maxOutboundCSERetries bounds how many times node provisioning is retried when the
// CSE outbound connectivity preflight check fails (ERR_OUTBOUND_CONN_FAIL / exit 50).
// It counts retries, not attempts: ConfigureAndCreateVMSS makes at most
// maxOutboundCSERetries+1 provisioning attempts in total. Retries are skipped for
// Windows scenarios and when config.Config.KeepVMSS is set.
//
// This is a known transient e2e-infrastructure flake; a genuine product regression
// fails on every attempt and still surfaces once the budget is exhausted.
const maxOutboundCSERetries = 2

func ConfigureAndCreateVMSS(ctx context.Context, s *Scenario) (*ScenarioVM, error) {
vm, err := CreateVMSSWithRetry(ctx, s)
var vm *ScenarioVM
var err error
for attempt := 0; ; attempt++ {
vm, err = CreateVMSSWithRetry(ctx, s)
if err == nil {
break
}
// Known transient e2e-infra flake: the CSE outbound connectivity preflight check
// (curl mcr.microsoft.com, optionally via the e2e proxy) intermittently fails all
// retries and exits ERR_OUTBOUND_CONN_FAIL (50) before kubelet starts. Recreate the
// node a bounded number of times to reduce PR-gate noise without masking real
// regressions, which fail consistently and survive the retry budget.
if attempt >= maxOutboundCSERetries || s.IsWindows() || config.Config.KeepVMSS {
break
}
// The VMExtensionProvisioningError returned by the create operation does not reliably
// embed the CSE status JSON, so classify the failure from the extension instance view
// (the same source getCustomScriptExtensionStatus parses) rather than string-matching
// the ARM error. Only the outbound preflight exit code is treated as retryable.
exitCode, ok := getLinuxCSEExitCode(ctx, s)
if !ok || exitCode != cseExitCodeOutboundConnFail {
break
}
toolkit.Logf(ctx, "CSE failed with ERR_OUTBOUND_CONN_FAIL (exit %s) on VMSS %q: known transient e2e outbound flake, recreating node (attempt %d/%d)", exitCode, s.Runtime.VMSSName, attempt+1, maxOutboundCSERetries)
deleteVMSSAndWait(ctx, s)
}
Comment on lines 83 to +109

skipTestIfSKUNotAvailableErr(s.T, err)

return vm, err
}

// getLinuxCSEExitCode queries the VMSS instance view and returns the Linux CSE exit code
// parsed from an extension status. It returns the first status, across all VMs and
// extensions, that parseLinuxCSEMessage can parse into a non-empty ExitCode.
//
// It reports ok=false when no parseable CSE exit code is available (e.g. Windows, a
// non-CSE failure, or the instance view is not yet populated) or when listing the VMSS
// instances fails; a listing failure is logged on the scenario's test logger. This is the
// reliable source of the exit code because the ARM provisioning error does not
// consistently carry the full CSE status payload.
//
// The lookup runs on a context detached from ctx's cancellation and capped at one minute.
func getLinuxCSEExitCode(ctx context.Context, s *Scenario) (string, bool) {
	ctx, cancel := context.WithTimeout(context.WithoutCancel(ctx), time.Minute)
	defer cancel()
	pager := config.Azure.VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetVMsClientListOptions{
		Expand: to.Ptr("instanceView"),
	})
	for pager.More() {
		page, err := pager.NextPage(ctx)
		if err != nil {
			// Not fatal: the caller treats ok=false as "not retryable". Log it so a missed
			// retry can be traced back to the failed lookup.
			s.T.Logf("failed to list vmss %q instance view while classifying CSE failure: %s", s.Runtime.VMSSName, err)
			return "", false
		}
		for _, vmssVM := range page.Value {
			// Elements are pointers; guard every level before dereferencing.
			if vmssVM == nil || vmssVM.Properties == nil || vmssVM.Properties.InstanceView == nil {
				continue
			}
			for _, extension := range vmssVM.Properties.InstanceView.Extensions {
				if extension == nil {
					continue
				}
				for _, status := range extension.Statuses {
					if status == nil {
						continue
					}
					cseStatus, err := parseLinuxCSEMessage(*status)
					if err != nil || cseStatus == nil || cseStatus.ExitCode == "" {
						// Not a CSE status (or not parseable); keep looking.
						continue
					}
					return cseStatus.ExitCode, true
				}
			}
		}
	}
	return "", false
}

// deleteVMSSAndWait force-deletes the scenario's VMSS and blocks until the delete
// operation finishes, so the same VMSS name can be reused by the next provisioning
// attempt without a create/delete conflict. This differs from deleteVMSS, which is
// described as fire-and-forget at test cleanup.
//
// The delete runs on a context detached from ctx's cancellation and capped at five
// minutes. Failures are only logged on the scenario's test logger; nothing is returned.
func deleteVMSSAndWait(ctx context.Context, s *Scenario) {
	deleteCtx, cancel := context.WithTimeout(context.WithoutCancel(ctx), 5*time.Minute)
	defer cancel()

	resourceGroup := *s.Runtime.Cluster.Model.Properties.NodeResourceGroup
	deleteOpts := &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{
		ForceDeletion: to.Ptr(true),
	}

	poller, beginErr := config.Azure.VMSS.BeginDelete(deleteCtx, resourceGroup, s.Runtime.VMSSName, deleteOpts)
	if beginErr != nil {
		s.T.Logf("failed to begin delete of vmss %q for retry: %s", s.Runtime.VMSSName, beginErr)
		return
	}

	_, pollErr := poller.PollUntilDone(deleteCtx, config.DefaultPollUntilDoneOptions)
	if pollErr != nil {
		s.T.Logf("failed to wait for delete of vmss %q for retry: %s", s.Runtime.VMSSName, pollErr)
	}
}

// CustomDataWithHack is similar to nodeconfigutils.CustomData, but it uses a hack to run new aks-node-controller binary.
// Original aks-node-controller isn't run because it fails systemd check validating aks-node-controller-config.json exists
// (check aks-node-controller.service for details).
Expand Down
92 changes: 92 additions & 0 deletions e2e/vmss_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package e2e

import (
"testing"

"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
"github.com/stretchr/testify/require"
)

// TestCSEExitCodeOutboundConnFail guards the harness constant against drifting from the
// exit code that ERR_OUTBOUND_CONN_FAIL emits in
// parts/linux/cloud-init/artifacts/cse_helpers.sh. Changing the product error code must be
// accompanied by an update to the retry mitigation, which this test enforces.
func TestCSEExitCodeOutboundConnFail(t *testing.T) {
	const wantExitCode = "50"
	require.Equal(t, wantExitCode, cseExitCodeOutboundConnFail)
}

// TestParseLinuxCSEMessageOutboundExitCode verifies that parseLinuxCSEMessage extracts the
// outbound-connectivity exit code from a real CustomScript extension instance-view status.
// getLinuxCSEExitCode relies on this parsing to classify the retryable e2e flake, so a
// change to the message format must be reflected here.
func TestParseLinuxCSEMessageOutboundExitCode(t *testing.T) {
tests := []struct {
name string
code string
message string
wantExitCode string
wantErr bool
}{
{
name: "well-formed CSE json with outbound exit code",
code: "ProvisioningState/failed/0",
message: `Enable failed: [stdout] { "ExitCode": "50", "Output": "+ exit 50" } [stderr]`,
wantExitCode: "50",
},
{
name: "unparsable body falls back to extension exit status",
code: "ProvisioningState/failed/0",
message: `Enable failed: failed to execute command: command terminated with exit status=50 [stdout]not-json[stderr]`,
wantExitCode: "50",
},
{
name: "well-formed CSE json with non-outbound exit code",
code: "ProvisioningState/failed/0",
message: `Enable failed: [stdout] { "ExitCode": "51", "Output": "+ exit 51" } [stderr]`,
wantExitCode: "51",
},
{
// Real Test_Ubuntu2204_HTTPSProxy_PrivateDNS/default failure: the outer extension
// wrapper and the CSE status both report 50.
name: "real outbound flake, outer exit 50 and cse exit 50",
code: "ProvisioningState/failed/0",
message: "failed to execute command: command terminated with exit status=50\n[stdout]\n" +
`{ "ExitCode": "50", "Output": "Processing manual pages under /usr/local/man...\n++ date\n+ echo 'man-db finished updates'\n+ exit 50", "Error": "", "ExecDuration": "155", "BootDatapoints": { "KubeletStartTime": "n/a" } }` +
"\n\n[stderr]\ndate: invalid date 'n/a'\n",
wantExitCode: "50",
},
{
// Real Test_Ubuntu2204_HTTPSProxy_PrivateDNS/scriptless_nbc failure: the outer
// extension wrapper reports exit status=1, but the CSE status reports 50. The
// classifier must read the CSE ExitCode field, not the outer wrapper.
name: "real outbound flake, outer exit 1 but cse exit 50",
code: "ProvisioningState/failed/0",
message: "failed to execute command: command terminated with exit status=1\n[stdout]\n" +
`{ "ExitCode": "50", "Output": "man-db finished updates\n+ exit 50", "Error": "", "ExecDuration": "70", "BootDatapoints": { "KubeletStartTime": "n/a" } }` +
"\n\n[stderr]\ndate\n",
wantExitCode: "50",
},
{
name: "no parsable body",
code: "ProvisioningState/failed/0",
message: `Enable failed with no parsable body`,
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
status := armcompute.InstanceViewStatus{
Code: to.Ptr(tt.code),
Message: to.Ptr(tt.message),
}
cseStatus, err := parseLinuxCSEMessage(status)
if tt.wantErr {
require.Error(t, err)
return
}
require.NoError(t, err)
require.NotNil(t, cseStatus)
require.Equal(t, tt.wantExitCode, cseStatus.ExitCode)
})
}
}
Loading