diff --git a/.agents/skills/run-azure-e2e-tests/SKILL.md b/.agents/skills/run-azure-e2e-tests/SKILL.md new file mode 100644 index 000000000000..f6bd84b2d991 --- /dev/null +++ b/.agents/skills/run-azure-e2e-tests/SKILL.md @@ -0,0 +1,83 @@ +--- +name: run-azure-e2e-tests +description: 'Run Azure CAS end-to-end tests — per-suite execution with focus filtering, background execution, and local/CI workflows. Use when: running e2e tests, debugging test failures, adding new test suites.' +--- + +# E2E Tests for Azure CAS + +## Test Structure + +``` +cluster-autoscaler/cloudprovider/azure/test/ +├── suites/ +│ └── scaleup/ # Scale-up/down test +│ └── suite_test.go +├── pkg/ +│ └── environment/ # Shared Environment struct + helpers +│ └── environment.go +├── Makefile # Local + CI targets +└── go.mod +``` + +## Local Developer Workflow + +From `cluster-autoscaler/cloudprovider/azure/test/`: + +### First-time setup + +```bash +az login +make setup-cluster # Creates AKS + ACR + workload identity (~5 min) +make deploy-local # Builds + deploys CAS via skaffold (~1 min) +``` + +### Running tests + +```bash +export AZURE_SUBSCRIPTION_ID="$(az account show --query id -o tsv)" +export AZURE_RESOURCE_GROUP="MC_..." # Node resource group (printed by setup-cluster) + +make e2etests # Run all suites +make e2etests TEST_SUITE=scaleup # Run single suite +make e2etests FOCUS="scales up" # Focus filter +``` + +### After code changes + +```bash +make deploy-local # Rebuild + redeploy CAS +make e2etests TEST_SUITE=scaleup +``` + +### Utility commands + +- `make list-suites` — list available test suites +- `make validate-env` — check required env vars +- `make deploy-local-dev` — skaffold watch mode (auto-redeploy on changes) + +### Background execution (survives VPN drops) + +```bash +nohup make e2etests TEST_SUITE=scaleup > e2e.log 2>&1 & +tail -f e2e.log +``` + +## CI (Prow) + +`make test-e2e` builds the CAS image and deploys via Helm (inside BeforeSuite), using cluster info from CAPZ. 
The Helm deploy is triggered by the `-cas-image-repository` and `-cas-image-tag` flags — both must be set; when either is absent (the local workflow), the Helm deploy is skipped and CAS is assumed to be already running. 
github.com/mikefarah/yq/v4@latest", + "disable skaffold metrics": "skaffold config set --global collect-metrics false" + }, + + "remoteUser": "vscode" +} diff --git a/cluster-autoscaler/cloudprovider/azure/examples/dev/aks-dev-deploy.sh b/cluster-autoscaler/cloudprovider/azure/examples/dev/aks-dev-deploy.sh index f66cec602163..05a6bec18c23 100755 --- a/cluster-autoscaler/cloudprovider/azure/examples/dev/aks-dev-deploy.sh +++ b/cluster-autoscaler/cloudprovider/azure/examples/dev/aks-dev-deploy.sh @@ -69,6 +69,27 @@ yq '(.. | select(tag == "!!str")) |= envsubst(nu)' \ cluster-autoscaler-vmss-wi-dynamic.yaml.tpl > \ cluster-autoscaler-vmss-wi-dynamic.yaml +# create the dynamic node group config (required by the AKS fork) +kubectl apply -f - <\033[0m\n"} \ + /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-25s\033[0m %s\n", $$1, $$2 } \ + /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) }' $(MAKEFILE_LIST) + +##@ Cluster Setup (one-time) + +.PHONY: setup-cluster +setup-cluster: ## Create AKS cluster + ACR + workload identity for e2e testing + cd $(DEV_DIR) && bash ./aks-dev-deploy.sh + +##@ Build & Deploy + +.PHONY: deploy-local +deploy-local: ## Build CAS and deploy to cluster via skaffold + cd $(CAS_ROOT) && skaffold run --filename cloudprovider/azure/examples/dev/skaffold.yaml + +.PHONY: deploy-local-dev +deploy-local-dev: ## Build + deploy CAS in watch mode (auto-redeploy on changes) + cd $(CAS_ROOT) && skaffold dev --filename cloudprovider/azure/examples/dev/skaffold.yaml + +##@ E2E Testing + +.PHONY: e2etests +e2etests: ## Run e2e tests (CAS must already be deployed) + go run github.com/onsi/ginkgo/v2/ginkgo \ + --tags e2e \ + -v --trace \ + --timeout $(TEST_TIMEOUT) \ + --output-dir "$(ARTIFACTS)" \ + --junit-report="junit.e2e_suite.1.xml" \ + $(if $(FOCUS),--focus="$(FOCUS)",) \ + $(if $(LABEL_FILTER),--label-filter="$(LABEL_FILTER)",) \ + ./suites/$$(echo $(TEST_SUITE) | tr A-Z a-z)/... 
-- \ + -resource-group="$(AZURE_RESOURCE_GROUP)" + +##@ Utilities + +.PHONY: list-suites +list-suites: ## List available test suites + @find suites -mindepth 1 -maxdepth 1 -type d -printf '%f\n' 2>/dev/null || echo "No suites found." + +.PHONY: validate-env +validate-env: ## Check required environment variables + @missing=""; \ + for var in AZURE_SUBSCRIPTION_ID AZURE_RESOURCE_GROUP; do \ + eval val=\$$$$var; \ + if [ -z "$$val" ]; then missing="$$missing $$var"; fi; \ + done; \ + if [ -n "$$missing" ]; then \ + echo "ERROR: Missing required environment variables:$$missing"; \ + exit 1; \ + fi; \ + echo "All required environment variables are set." diff --git a/cluster-autoscaler/cloudprovider/azure/test/e2e/e2e_suite_test.go b/cluster-autoscaler/cloudprovider/azure/test/e2e/e2e_suite_test.go deleted file mode 100644 index a0d4a3a49ebb..000000000000 --- a/cluster-autoscaler/cloudprovider/azure/test/e2e/e2e_suite_test.go +++ /dev/null @@ -1,177 +0,0 @@ -//go:build e2e - -/* -Copyright 2024 The Kubernetes Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package e2e_test - -import ( - "context" - "errors" - "flag" - "fmt" - "os" - "testing" - "time" - - "github.com/Azure/azure-sdk-for-go/sdk/azidentity" - "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" - . "github.com/onsi/ginkgo/v2" - . 
"github.com/onsi/gomega" - "helm.sh/helm/v3/pkg/action" - "helm.sh/helm/v3/pkg/chart/loader" - "helm.sh/helm/v3/pkg/cli" - "helm.sh/helm/v3/pkg/storage/driver" - corev1 "k8s.io/api/core/v1" - "sigs.k8s.io/controller-runtime/pkg/client" -) - -const ( - casReleaseName = "cluster-autoscaler" -) - -var ( - ctx = context.Background() - vmss *armcompute.VirtualMachineScaleSetsClient - k8s client.Client - helmEnv = cli.New() - - resourceGroup string - clusterName string - clientID string - casNamespace string - casServiceAccountName string - casImageRepository string - casImageTag string -) - -func init() { - flag.StringVar(&resourceGroup, "resource-group", "", "resource group containing cluster-autoscaler-managed resources") - flag.StringVar(&clusterName, "cluster-name", "", "Cluster API Cluster name for the cluster to be managed by cluster-autoscaler") - flag.StringVar(&clientID, "client-id", "", "Azure client ID to be used by cluster-autoscaler") - flag.StringVar(&casNamespace, "cas-namespace", "", "Namespace in which to install cluster-autoscaler") - flag.StringVar(&casServiceAccountName, "cas-serviceaccount-name", "", "Name of the ServiceAccount to be used by cluster-autoscaler") - flag.StringVar(&casImageRepository, "cas-image-repository", "", "Repository of the container image for cluster-autoscaler") - flag.StringVar(&casImageTag, "cas-image-tag", "", "Tag of the container image for cluster-autoscaler") -} - -func TestE2E(t *testing.T) { - RegisterFailHandler(Fail) - RunSpecs(t, "e2e Suite") -} - -var _ = BeforeSuite(func() { - azCred, err := azidentity.NewDefaultAzureCredential(nil) - Expect(err).NotTo(HaveOccurred()) - vmss, err = armcompute.NewVirtualMachineScaleSetsClient(os.Getenv("AZURE_SUBSCRIPTION_ID"), azCred, nil) - Expect(err).NotTo(HaveOccurred()) - - restConfig, err := helmEnv.RESTClientGetter().ToRESTConfig() - Expect(err).NotTo(HaveOccurred()) - k8s, err = client.New(restConfig, client.Options{}) - Expect(err).NotTo(HaveOccurred()) - - 
ensureHelmValues(map[string]interface{}{ - "cloudProvider": "azure", - "azureTenantID": os.Getenv("AZURE_TENANT_ID"), - "azureSubscriptionID": os.Getenv("AZURE_SUBSCRIPTION_ID"), - "azureUseWorkloadIdentityExtension": true, - "azureResourceGroup": resourceGroup, - "podLabels": map[string]interface{}{ - "azure.workload.identity/use": "true", - }, - "rbac": map[string]interface{}{ - "serviceAccount": map[string]interface{}{ - "name": casServiceAccountName, - "annotations": map[string]interface{}{ - "azure.workload.identity/tenant-id": os.Getenv("AZURE_TENANT_ID"), - "azure.workload.identity/client-id": clientID, - }, - }, - }, - "autoDiscovery": map[string]interface{}{ - "clusterName": clusterName, - }, - "nodeSelector": map[string]interface{}{ - "kubernetes.io/os": "linux", - }, - "image": map[string]interface{}{ - "repository": casImageRepository, - "tag": casImageTag, - "pullPolicy": "Always", - }, - }) -}) - -func allVMSSStable(g Gomega) { - pager := vmss.NewListPager(resourceGroup, nil) - expectedNodes := 0 - for pager.More() { - page, err := pager.NextPage(ctx) - g.Expect(err).NotTo(HaveOccurred()) - for _, scaleset := range page.Value { - g.Expect(*scaleset.Properties.ProvisioningState).To(Equal("Succeeded")) - expectedNodes += int(*scaleset.SKU.Capacity) - } - } - - nodes := &corev1.NodeList{} - g.Expect(k8s.List(ctx, nodes)).To(Succeed()) - g.Expect(nodes.Items).To(SatisfyAll( - HaveLen(int(expectedNodes)), - ContainElements(Satisfy(func(node corev1.Node) bool { - for _, cond := range node.Status.Conditions { - if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { - return true - } - } - return false - })), - )) -} - -func ensureHelmValues(values map[string]interface{}) { - helmCfg := new(action.Configuration) - Expect(helmCfg.Init(helmEnv.RESTClientGetter(), casNamespace, "secret", func(format string, v ...interface{}) { - GinkgoLogr.Info(fmt.Sprintf(format, v...)) - })).To(Succeed()) - - chart, err := 
loader.Load("../../../../../charts/cluster-autoscaler") - Expect(err).NotTo(HaveOccurred()) - - get := action.NewGet(helmCfg) - _, err = get.Run(casReleaseName) - if errors.Is(err, driver.ErrReleaseNotFound) { - install := action.NewInstall(helmCfg) - install.Timeout = 5 * time.Minute - install.Wait = true - install.CreateNamespace = true - install.ReleaseName = casReleaseName - install.Namespace = casNamespace - _, err := install.Run(chart, values) - Expect(err).NotTo(HaveOccurred()) - return - } else { - Expect(err).NotTo(HaveOccurred()) - } - - upgrade := action.NewUpgrade(helmCfg) - upgrade.Timeout = 5 * time.Minute - upgrade.Wait = true - upgrade.ReuseValues = true - _, err = upgrade.Run(casReleaseName, chart, values) - Expect(err).NotTo(HaveOccurred()) -} diff --git a/cluster-autoscaler/cloudprovider/azure/test/pkg/environment/environment.go b/cluster-autoscaler/cloudprovider/azure/test/pkg/environment/environment.go new file mode 100644 index 000000000000..be1938965323 --- /dev/null +++ b/cluster-autoscaler/cloudprovider/azure/test/pkg/environment/environment.go @@ -0,0 +1,226 @@ +//go:build e2e + +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package environment provides a shared test environment for Azure CAS e2e tests. +// The environment provides K8s and Azure clients for test assertions, +// and optionally deploys CAS via Helm (for CI) or assumes it's already running (for local dev). 
+package environment + +import ( + "context" + "errors" + "fmt" + "os" + "time" + + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "helm.sh/helm/v3/pkg/action" + "helm.sh/helm/v3/pkg/chart/loader" + "helm.sh/helm/v3/pkg/cli" + "helm.sh/helm/v3/pkg/storage/driver" + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/config" +) + +const casReleaseName = "cluster-autoscaler" + +// HelmConfig holds optional Helm deployment configuration. +// When populated, BeforeSuite deploys CAS via Helm (CI path). +// When empty, tests assume CAS is already running (local dev path). +type HelmConfig struct { + ChartPath string + ClusterName string + ClientID string + CASNamespace string + CASServiceAccountName string + CASImageRepository string + CASImageTag string +} + +// IsEnabled returns true if Helm deployment is configured. +func (h *HelmConfig) IsEnabled() bool { + return h != nil && h.CASImageRepository != "" && h.CASImageTag != "" +} + +// Environment holds all clients and configuration for an e2e test suite. +type Environment struct { + Ctx context.Context + K8s client.Client + VMSS *armcompute.VirtualMachineScaleSetsClient + ResourceGroup string + SubscriptionID string + TenantID string + Helm *HelmConfig +} + +// NewEnvironment creates a fully initialized Environment. 
+func NewEnvironment(resourceGroup string, helm *HelmConfig) *Environment { + env := &Environment{ + Ctx: context.Background(), + ResourceGroup: resourceGroup, + SubscriptionID: os.Getenv("AZURE_SUBSCRIPTION_ID"), + TenantID: os.Getenv("AZURE_TENANT_ID"), + Helm: helm, + } + + azCred, err := azidentity.NewDefaultAzureCredential(nil) + Expect(err).NotTo(HaveOccurred()) + + env.VMSS, err = armcompute.NewVirtualMachineScaleSetsClient(env.SubscriptionID, azCred, nil) + Expect(err).NotTo(HaveOccurred()) + + restConfig, err := config.GetConfig() + Expect(err).NotTo(HaveOccurred()) + env.K8s, err = client.New(restConfig, client.Options{}) + Expect(err).NotTo(HaveOccurred()) + + return env +} + +// --- Helm helpers --- + +// EnsureHelmRelease deploys or updates CAS via Helm if HelmConfig is enabled. +// If Helm is not configured, this is a no-op (CAS is managed externally). +func (env *Environment) EnsureHelmRelease(extraValues map[string]interface{}) { + if !env.Helm.IsEnabled() { + GinkgoLogr.Info("Helm not configured — assuming CAS is already deployed (e.g., via skaffold)") + return + } + + values := map[string]interface{}{ + "cloudProvider": "azure", + "azureTenantID": env.TenantID, + "azureSubscriptionID": env.SubscriptionID, + "azureUseWorkloadIdentityExtension": true, + "azureResourceGroup": env.ResourceGroup, + "podLabels": map[string]interface{}{ + "azure.workload.identity/use": "true", + }, + "rbac": map[string]interface{}{ + "serviceAccount": map[string]interface{}{ + "name": env.Helm.CASServiceAccountName, + "annotations": map[string]interface{}{ + "azure.workload.identity/tenant-id": env.TenantID, + "azure.workload.identity/client-id": env.Helm.ClientID, + }, + }, + }, + "autoDiscovery": map[string]interface{}{ + "clusterName": env.Helm.ClusterName, + }, + "nodeSelector": map[string]interface{}{ + "kubernetes.io/os": "linux", + }, + "image": map[string]interface{}{ + "repository": env.Helm.CASImageRepository, + "tag": env.Helm.CASImageTag, + "pullPolicy": 
"Always", + }, + } + + // Merge extra values (e.g., extraArgs for scale-down tuning) + for k, v := range extraValues { + values[k] = v + } + + helmEnv := cli.New() + helmCfg := new(action.Configuration) + Expect(helmCfg.Init(helmEnv.RESTClientGetter(), env.Helm.CASNamespace, "secret", func(format string, v ...interface{}) { + GinkgoLogr.Info(fmt.Sprintf(format, v...)) + })).To(Succeed()) + + chart, err := loader.Load(env.Helm.ChartPath) + Expect(err).NotTo(HaveOccurred()) + + get := action.NewGet(helmCfg) + _, err = get.Run(casReleaseName) + if errors.Is(err, driver.ErrReleaseNotFound) { + install := action.NewInstall(helmCfg) + install.Timeout = 5 * time.Minute + install.Wait = true + install.CreateNamespace = true + install.ReleaseName = casReleaseName + install.Namespace = env.Helm.CASNamespace + _, err = install.Run(chart, values) + Expect(err).NotTo(HaveOccurred()) + return + } + Expect(err).NotTo(HaveOccurred()) + + upgrade := action.NewUpgrade(helmCfg) + upgrade.Timeout = 5 * time.Minute + upgrade.Wait = true + upgrade.ReuseValues = true + _, err = upgrade.Run(casReleaseName, chart, values) + Expect(err).NotTo(HaveOccurred()) +} + +// --- VMSS helpers --- + +// AllVMSSStable checks that all VMSS in the resource group have Succeeded +// provisioning state and the number of Ready K8s nodes matches total VMSS capacity. 
+func (env *Environment) AllVMSSStable(g Gomega) { + pager := env.VMSS.NewListPager(env.ResourceGroup, nil) + expectedNodes := 0 + for pager.More() { + page, err := pager.NextPage(env.Ctx) + g.Expect(err).NotTo(HaveOccurred()) + for _, scaleset := range page.Value { + g.Expect(*scaleset.Properties.ProvisioningState).To(Equal("Succeeded")) + expectedNodes += int(*scaleset.SKU.Capacity) + } + } + + nodes := &corev1.NodeList{} + g.Expect(env.K8s.List(env.Ctx, nodes)).To(Succeed()) + g.Expect(nodes.Items).To(SatisfyAll( + HaveLen(expectedNodes), + ContainElements(Satisfy(func(node corev1.Node) bool { + for _, cond := range node.Status.Conditions { + if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { + return true + } + } + return false + })), + )) +} + +// --- K8s helpers --- + +// ReadyNodeCount returns the number of Ready nodes in the cluster. +func (env *Environment) ReadyNodeCount() (int, error) { + readyCount := 0 + nodes := &corev1.NodeList{} + if err := env.K8s.List(env.Ctx, nodes); err != nil { + return 0, err + } + for _, node := range nodes.Items { + for _, cond := range node.Status.Conditions { + if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { + readyCount++ + break + } + } + } + return readyCount, nil +} diff --git a/cluster-autoscaler/cloudprovider/azure/test/e2e/azure_test.go b/cluster-autoscaler/cloudprovider/azure/test/suites/scaleup/suite_test.go similarity index 50% rename from cluster-autoscaler/cloudprovider/azure/test/e2e/azure_test.go rename to cluster-autoscaler/cloudprovider/azure/test/suites/scaleup/suite_test.go index 896fbab25dbe..69d99cde2624 100644 --- a/cluster-autoscaler/cloudprovider/azure/test/e2e/azure_test.go +++ b/cluster-autoscaler/cloudprovider/azure/test/suites/scaleup/suite_test.go @@ -16,9 +16,12 @@ See the License for the specific language governing permissions and limitations under the License. */ -package e2e_test +package scaleup_test import ( + "flag" + "testing" + . 
"github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" appsv1 "k8s.io/api/apps/v1" @@ -28,50 +31,91 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" + + "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/azure/test/pkg/environment" +) + +var ( + env *environment.Environment + resourceGroup string + + // Helm deployment flags (CI path). When image repo+tag are set, CAS is deployed via Helm. + // When empty, CAS is assumed to already be running (local dev path via skaffold). + clusterName string + clientID string + casNamespace string + casServiceAccountName string + casImageRepository string + casImageTag string ) +func init() { + flag.StringVar(&resourceGroup, "resource-group", "", "resource group containing cluster-autoscaler-managed resources (the MC_ node resource group)") + flag.StringVar(&clusterName, "cluster-name", "", "Cluster API Cluster name (CI only)") + flag.StringVar(&clientID, "client-id", "", "Azure client ID for CAS workload identity (CI only)") + flag.StringVar(&casNamespace, "cas-namespace", "default", "Namespace for CAS Helm release (CI only)") + flag.StringVar(&casServiceAccountName, "cas-serviceaccount-name", "cluster-autoscaler", "CAS ServiceAccount name (CI only)") + flag.StringVar(&casImageRepository, "cas-image-repository", "", "CAS image repository (CI only, triggers Helm deploy)") + flag.StringVar(&casImageTag, "cas-image-tag", "", "CAS image tag (CI only, triggers Helm deploy)") +} + +func TestScaleUp(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Scale Up Suite") +} + +var _ = BeforeSuite(func() { + var helm *environment.HelmConfig + if casImageRepository != "" && casImageTag != "" { + helm = &environment.HelmConfig{ + // From suites/scaleup/, 6 levels up reaches the repo root where charts/ lives. 
+ ChartPath: "../../../../../../charts/cluster-autoscaler", + ClusterName: clusterName, + ClientID: clientID, + CASNamespace: casNamespace, + CASServiceAccountName: casServiceAccountName, + CASImageRepository: casImageRepository, + CASImageTag: casImageTag, + } + } + env = environment.NewEnvironment(resourceGroup, helm) + env.EnsureHelmRelease(map[string]interface{}{ + "extraArgs": map[string]interface{}{ + "scale-down-delay-after-add": "10s", + "scale-down-unneeded-time": "10s", + "scale-down-candidates-pool-ratio": "1.0", + "unremovable-node-recheck-timeout": "10s", + "skip-nodes-with-system-pods": "false", + "skip-nodes-with-local-storage": "false", + }, + }) +}) + var _ = Describe("Azure Provider", func() { - var ( - namespace *corev1.Namespace - ) + var namespace *corev1.Namespace BeforeEach(func() { - Eventually(allVMSSStable, "10m", "30s").Should(Succeed()) - + Eventually(env.AllVMSSStable, "10m", "30s").Should(Succeed()) namespace = &corev1.Namespace{ - ObjectMeta: metav1.ObjectMeta{ - GenerateName: "azure-e2e-", - }, + ObjectMeta: metav1.ObjectMeta{GenerateName: "azure-e2e-"}, } - Expect(k8s.Create(ctx, namespace)).To(Succeed()) + Expect(env.K8s.Create(env.Ctx, namespace)).To(Succeed()) }) AfterEach(func() { - Expect(k8s.Delete(ctx, namespace)).To(Succeed()) + Expect(env.K8s.Delete(env.Ctx, namespace)).To(Succeed()) Eventually(func() bool { - err := k8s.Get(ctx, client.ObjectKeyFromObject(namespace), &corev1.Namespace{}) + err := env.K8s.Get(env.Ctx, client.ObjectKeyFromObject(namespace), &corev1.Namespace{}) return apierrors.IsNotFound(err) }, "1m", "5s").Should(BeTrue(), "Namespace "+namespace.Name+" still exists") }) It("scales up AKS node pools when pending Pods exist", func() { - ensureHelmValues(map[string]interface{}{ - "extraArgs": map[string]interface{}{ - "scale-down-delay-after-add": "10s", - "scale-down-unneeded-time": "10s", - "scale-down-candidates-pool-ratio": "1.0", - "unremovable-node-recheck-timeout": "10s", - 
"skip-nodes-with-system-pods": "false", - "skip-nodes-with-local-storage": "false", - }, - }) - nodes := &corev1.NodeList{} - Expect(k8s.List(ctx, nodes)).To(Succeed()) + Expect(env.K8s.List(env.Ctx, nodes)).To(Succeed()) nodeCountBefore := len(nodes.Items) By("Creating 100 Pods") - // https://raw.githubusercontent.com/kubernetes/website/main/content/en/examples/application/php-apache.yaml deploy := &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{ Name: "php-apache", @@ -109,13 +153,13 @@ var _ = Describe("Azure Provider", func() { }, }, } - Expect(k8s.Create(ctx, deploy)).To(Succeed()) + Expect(env.K8s.Create(env.Ctx, deploy)).To(Succeed()) By("Waiting for more Ready Nodes to exist") Eventually(func() (int, error) { readyCount := 0 nodes := &corev1.NodeList{} - if err := k8s.List(ctx, nodes); err != nil { + if err := env.K8s.List(env.Ctx, nodes); err != nil { return 0, err } for _, node := range nodes.Items { @@ -129,15 +173,15 @@ var _ = Describe("Azure Provider", func() { return readyCount, nil }, "10m", "10s").Should(BeNumerically(">", nodeCountBefore)) - Eventually(allVMSSStable, "10m", "30s").Should(Succeed()) + Eventually(env.AllVMSSStable, "10m", "30s").Should(Succeed()) By("Deleting 100 Pods") - Expect(k8s.Delete(ctx, deploy)).To(Succeed()) + Expect(env.K8s.Delete(env.Ctx, deploy)).To(Succeed()) By("Waiting for the original number of Nodes to be Ready") Eventually(func(g Gomega) { nodes := &corev1.NodeList{} - g.Expect(k8s.List(ctx, nodes)).To(Succeed()) + g.Expect(env.K8s.List(env.Ctx, nodes)).To(Succeed()) g.Expect(nodes.Items).To(SatisfyAll( HaveLen(nodeCountBefore), ContainElements(Satisfy(func(node corev1.Node) bool {