diff --git a/CLAUDE.md b/CLAUDE.md index 2fb4b0141..749e4827a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -212,7 +212,8 @@ The operator binary runs in four mutually exclusive modes (controlled by flags i **Mutating Webhook** (`internal/controller/podplacement/scheduling_gate_mutating_webhook.go`): - Adds `multiarch.openshift.io/scheduling-gate` to new pods - Respects namespace selector from ClusterPodPlacementConfig -- Always excludes: `openshift-*`, `kube-*`, `hypershift-*` namespaces +- Always excludes: `kube-*` namespaces (hardcoded) +- Other namespaces (`openshift-*`, `hypershift-*`, etc.) can be included via namespaceSelector configuration - Uses worker pool for event publishing (ants library, 16 workers) **Image Inspector** (`pkg/image/inspector.go`): @@ -244,12 +245,15 @@ The operator binary runs in four mutually exclusive modes (controlled by flags i ### Namespace Exclusions -System namespaces are excluded from pod placement by default (cannot be overridden): -- `openshift-*` -- `kube-*` -- `hypershift-*` +**Hardcoded exclusions** (cannot be overridden): +- `kube-*` - Core Kubernetes namespaces are always excluded -Additional namespaces can be excluded via namespaceSelector with label `multiarch.openshift.io/exclude-pod-placement`. +**Configurable via namespaceSelector:** +- All other namespaces (including `openshift-*`, `hypershift-*`, user namespaces) can be included or excluded by configuring the `namespaceSelector` field in ClusterPodPlacementConfig +- Common exclusion pattern: add label `multiarch.openshift.io/exclude-pod-placement` to namespaces you want to skip +- Default example namespaceSelector excludes namespaces with this label + +The operator namespace is also always excluded from pod placement. ### Plugins System @@ -340,7 +344,7 @@ RUNTIME_IMAGE= # Override runtime base image ## Important Constraints - ClusterPodPlacementConfig must be named "cluster" (singleton enforced by webhook) -- Namespaces `openshift-*`, `kube-*`, and `hypershift-*` are always excluded from pod placement +- Only `kube-*` namespaces are hardcoded as always excluded from pod placement; other system namespaces (`openshift-*`, `hypershift-*`) can be included via namespaceSelector configuration - CGO is required for building (uses gpgme for registry authentication) - Only one execution mode flag can be set at a time in main.go - The operator uses vendored dependencies (`GOFLAGS=-mod=vendor`) diff --git a/api/v1beta1/clusterpodplacementconfig_types.go b/api/v1beta1/clusterpodplacementconfig_types.go index b22f60b60..9454e1ac2 100644 --- a/api/v1beta1/clusterpodplacementconfig_types.go +++ b/api/v1beta1/clusterpodplacementconfig_types.go @@ -64,6 +64,7 @@ type ClusterPodPlacementConfigStatus struct { Conditions []metav1.Condition `json:"conditions,omitempty"` // The following fields are used to derive the conditions. They are not exposed to the user. + // nolint:staticcheck,revive // controller-gen requires json tags on all fields, even unexported ones available bool `json:"-"` progressing bool `json:"-"` degraded bool `json:"-"` diff --git a/api/v1beta1/clusterpodplacementconfig_webhook.go b/api/v1beta1/clusterpodplacementconfig_webhook.go index 79cf33497..1fb372a3a 100644 --- a/api/v1beta1/clusterpodplacementconfig_webhook.go +++ b/api/v1beta1/clusterpodplacementconfig_webhook.go @@ -21,6 +21,8 @@ import ( "errors" "fmt" + "github.com/openshift/multiarch-tuning-operator/api/common" + "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -72,6 +74,9 @@ func (v *ClusterPodPlacementConfigValidator) validate(obj runtime.Object) (warni if !ok { return nil, errors.New("not a ClusterPodPlacementConfig") } + if cppc.Name != common.SingletonResourceObjectName { + return nil, fmt.Errorf("ClusterPodPlacementConfig must be named %q", common.SingletonResourceObjectName) + } if cppc.Spec.Plugins == nil || cppc.Spec.Plugins.NodeAffinityScoring == nil { return nil, nil } diff --git a/api/v1beta1/podplacementconfig_types.go b/api/v1beta1/podplacementconfig_types.go index 9354d5b88..3520cd61f 100644 --- a/api/v1beta1/podplacementconfig_types.go +++ b/api/v1beta1/podplacementconfig_types.go @@ -33,9 +33,8 @@ type PodPlacementConfigSpec struct { LabelSelector *metav1.LabelSelector `json:"labelSelector,omitempty"` // Plugins defines the configurable plugins for this component. - // This field is required. - // +kubebuilder:validation:Required - Plugins *plugins.LocalPlugins `json:"plugins"` + // +optional + Plugins *plugins.LocalPlugins `json:"plugins,omitempty"` // Priority defines the priority of the PodPlacementConfig and only accepts values in the range 0-255. // This field is optional and will default to 0 if not set. diff --git a/bundle/manifests/multiarch-tuning-operator.clusterserviceversion.yaml b/bundle/manifests/multiarch-tuning-operator.clusterserviceversion.yaml index 20e30a834..e59d05740 100644 --- a/bundle/manifests/multiarch-tuning-operator.clusterserviceversion.yaml +++ b/bundle/manifests/multiarch-tuning-operator.clusterserviceversion.yaml @@ -27,7 +27,7 @@ metadata: categories: OpenShift Optional, Other console.openshift.io/disable-operand-delete: "false" containerImage: registry.ci.openshift.org/origin/multiarch-tuning-operator:main - createdAt: "2026-03-02T21:50:07Z" + createdAt: "2026-04-14T00:14:34Z" features.operators.openshift.io/cnf: "false" features.operators.openshift.io/cni: "false" features.operators.openshift.io/csi: "false" diff --git a/bundle/manifests/multiarch.openshift.io_podplacementconfigs.yaml b/bundle/manifests/multiarch.openshift.io_podplacementconfigs.yaml index 8bfea17b2..60e4d790c 100644 --- a/bundle/manifests/multiarch.openshift.io_podplacementconfigs.yaml +++ b/bundle/manifests/multiarch.openshift.io_podplacementconfigs.yaml @@ -90,9 +90,7 @@ spec: type: object x-kubernetes-map-type: atomic plugins: - description: |- - Plugins defines the configurable plugins for this component. - This field is required. + description: Plugins defines the configurable plugins for this component. properties: nodeAffinityScoring: description: NodeAffinityScoring is the plugin that implements @@ -144,8 +142,6 @@ spec: maximum: 255 minimum: 0 type: integer - required: - - plugins type: object status: description: PodPlacementConfigStatus defines the observed state of PodPlacementConfig diff --git a/cmd/enoexec-daemon/main.go b/cmd/enoexec-daemon/main.go index 3d01206c1..1996bee90 100644 --- a/cmd/enoexec-daemon/main.go +++ b/cmd/enoexec-daemon/main.go @@ -30,16 +30,19 @@ func main() { } func bindFlags() { - flag.IntVar(&initialLogLevel, "initial-log-level", 0, "Initial log level. From 0 (Normal) to 5 (TraceAll)") + flag.IntVar(&initialLogLevel, "initial-log-level", 0, "Initial log level. From 0 (Normal) to 10 (maximum verbosity)") flag.BoolVar(&logDevMode, "log-dev-mode", false, "Enable development mode for zap logger") flag.Parse() } func initContext() (context.Context, context.CancelFunc) { ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + if initialLogLevel < 0 || initialLogLevel > 10 { + must(fmt.Errorf("initial-log-level must be in range [0,10], got %d", initialLogLevel), "invalid flag value") + } var logImpl *zap.Logger var err error - logLevel := zapcore.Level(int8(-initialLogLevel)) // #nosec G115 -- initialLogLevel is constrained to 0-5 range + logLevel := zapcore.Level(-initialLogLevel) if logDevMode { cfg := zap.NewDevelopmentConfig() cfg.Level = zap.NewAtomicLevelAt(logLevel) diff --git a/cmd/main.go b/cmd/main.go index 519a2307b..65ab673d4 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -297,6 +297,9 @@ func validateFlags() error { if btoi(enableOperator)+btoi(enableClusterPodPlacementConfigOperandControllers)+btoi(enableClusterPodPlacementConfigOperandWebHook)+btoi(enableENoExecEventControllers) > 1 { return errors.New("only one of the following flags can be set: --enable-operator, --enable-ppc-controllers, --enable-ppc-webhook, --enable-enoexec-event-controllers") } + if initialLogLevel < 0 || initialLogLevel > 10 { + return fmt.Errorf("initial-log-level must be in range [0,10], got %d", initialLogLevel) + } return nil } @@ -319,11 +322,13 @@ func bindFlags() { // This may be deprecated in the future. It is used to support the current way of setting the log level for operands // If operands will start to support a controller that watches the ClusterPodPlacementConfig, this flag may be removed // and the log level will be set in the ClusterPodPlacementConfig at runtime (with no need for reconciliation) - flag.IntVar(&initialLogLevel, "initial-log-level", common.LogVerbosityLevelNormal.ToZapLevelInt(), "Initial log level. Converted to zap") + flag.IntVar(&initialLogLevel, "initial-log-level", common.LogVerbosityLevelNormal.ToZapLevelInt(), "Initial log level (0-10). Converted to zap. CRD levels: 0=Normal, 1=Debug, 2=Trace, 3=TraceAll") klog.InitFlags(nil) flag.Parse() // Set the Log Level as AtomicLevel to allow runtime changes - utils.AtomicLevel = zapuber.NewAtomicLevelAt(zapcore.Level(int8(-initialLogLevel))) // #nosec G115 -- initialLogLevel is constrained to 0-3 range + // Safe to convert: initialLogLevel is validated to be in range [0,10] by validateFlags(), + // so -initialLogLevel is in range [-10,0] which fits in int8 range [-128,127] + utils.AtomicLevel = zapuber.NewAtomicLevelAt(zapcore.Level(int8(-initialLogLevel))) //nolint:gosec // G115 - conversion is safe, value validated to be in range zapLogger := zap.New(zap.Level(utils.AtomicLevel), zap.UseDevMode(false)) klog.SetLogger(zapLogger) ctrllog.SetLogger(zapLogger) diff --git a/config/crd/bases/multiarch.openshift.io_podplacementconfigs.yaml b/config/crd/bases/multiarch.openshift.io_podplacementconfigs.yaml index 7780eee67..306270cb3 100644 --- a/config/crd/bases/multiarch.openshift.io_podplacementconfigs.yaml +++ b/config/crd/bases/multiarch.openshift.io_podplacementconfigs.yaml @@ -90,9 +90,7 @@ spec: type: object x-kubernetes-map-type: atomic plugins: - description: |- - Plugins defines the configurable plugins for this component. - This field is required. + description: Plugins defines the configurable plugins for this component. properties: nodeAffinityScoring: description: NodeAffinityScoring is the plugin that implements @@ -144,8 +142,6 @@ spec: maximum: 255 minimum: 0 type: integer - required: - - plugins type: object status: description: PodPlacementConfigStatus defines the observed state of PodPlacementConfig diff --git a/hack/upgrade-automation/README.md b/hack/upgrade-automation/README.md index 4924b9412..793869c02 100644 --- a/hack/upgrade-automation/README.md +++ b/hack/upgrade-automation/README.md @@ -79,11 +79,10 @@ Updates all Go dependencies (smart handling skips incompatible versions): Re-vendors all dependencies: 1. `go mod tidy` -2. `rm -rf vendor/` -3. `go mod vendor` -4. Restores go directive if `go mod tidy` upgraded it +2. `make vendor` +3. Restores go directive if `go mod tidy` upgraded it -**Commit:** `go mod vendor` +**Commit:** `make vendor` ### Step 5: Run code generation @@ -220,7 +219,7 @@ sed_inplace() { | `validate_prerequisites()` | All above checks | Exit on any failure | **Branch naming:** -``` +```text upgrade-ocp-{ocp}-go-{go_minor}-k8s-{k8s_minor} Example: upgrade-ocp-4.20-go-1.24-k8s-1.34 @@ -277,19 +276,19 @@ hack/upgrade-automation/scripts/upgrade.sh 4.20 1.24 1.34.1 **Common causes and fixes:** **API deprecation:** -``` +```text Error: undefined: corev1.SomeOldAPI ``` Solution: Update code to use new API (check K8s release notes) **Test helper changes:** -``` +```text Error: cannot use X (type Y) as type Z ``` Solution: Update test setup code for new types **Import path changes:** -``` +```text Error: package X is not in GOROOT ``` Solution: Update import paths (check go.mod for correct versions) diff --git a/hack/upgrade-automation/scripts/lib/validations.sh b/hack/upgrade-automation/scripts/lib/validations.sh index cc512d966..00b2741ec 100755 --- a/hack/upgrade-automation/scripts/lib/validations.sh +++ b/hack/upgrade-automation/scripts/lib/validations.sh @@ -69,8 +69,8 @@ validate_and_create_branch() { read -p "Do you want to delete and recreate it? (y/N) " -n 1 -r >&2 echo "" >&2 if [[ $REPLY =~ ^[Yy]$ ]]; then - git branch -D "$branch_name" - git checkout -b "$branch_name" + # Use checkout -B to force recreation (works even if currently checked out) + git checkout -B "$branch_name" else echo "ERROR: Branch '$branch_name' already exists. Please delete it or choose a different version." >&2 return 1 @@ -124,12 +124,12 @@ validate_k8s_version_consistency() { echo "Validating k8s.io version consistency in go.mod..." >&2 local versions_count - versions_count=$(grep '^\s*k8s\.io/' go.mod | grep -v '//' | awk '{print $2}' | grep -oP 'v\d+\.\d+' | sort -u | wc -l) + versions_count=$(grep '^\s*k8s\.io/' go.mod | grep -v '//' | awk '{print $2}' | sed -E 's/(v[0-9]+\.[0-9]+).*/\1/' | sort -u | wc -l) if [[ "$versions_count" -gt 1 ]]; then echo "⚠️ WARNING: Multiple k8s.io minor versions detected in go.mod" >&2 echo " Found versions:" >&2 - grep '^\s*k8s\.io/' go.mod | grep -v '//' | awk '{print $1, $2}' | grep -oP 'v\d+\.\d+' | sort -u >&2 + grep '^\s*k8s\.io/' go.mod | grep -v '//' | awk '{print $1, $2}' | sed -E 's/.*(v[0-9]+\.[0-9]+).*/\1/' | sort -u >&2 echo " This is expected for packages like k8s.io/klog and k8s.io/utils" >&2 else echo "✅ All k8s.io dependencies at consistent minor version" >&2 @@ -160,7 +160,7 @@ validate_prerequisites() { local current_go current_k8s current_ocp current_go=$(grep '^go ' go.mod | awk '{print $2}') current_k8s=$(grep 'k8s.io/api' go.mod | head -1 | awk '{print $2}') - current_ocp=$(grep 'BUILD_IMAGE' Makefile | grep -oP 'openshift-\K[0-9]+\.[0-9]+') + current_ocp=$(grep 'BUILD_IMAGE' Makefile | sed -E 's/.*openshift-([0-9]+\.[0-9]+).*/\1/') echo "Current versions:" >&2 echo " Go: $current_go" >&2 @@ -172,5 +172,17 @@ validate_prerequisites() { echo " Kubernetes: $k8s_version" >&2 echo "" >&2 + # Validate golang image exists + if ! validate_golang_image_exists "$go_version"; then + return 1 + fi + + echo "" >&2 + + # Validate k8s version consistency (warning only) + validate_k8s_version_consistency + + echo "" >&2 + return 0 } diff --git a/hack/upgrade-automation/scripts/lib/version-discovery.sh b/hack/upgrade-automation/scripts/lib/version-discovery.sh index cee7649e5..2f78b8fba 100755 --- a/hack/upgrade-automation/scripts/lib/version-discovery.sh +++ b/hack/upgrade-automation/scripts/lib/version-discovery.sh @@ -23,12 +23,13 @@ discover_k8s_from_ocp_release() { echo "$k8s_version" else # Fallback: discover from openshift/api go.mod - local k8s_minor - k8s_minor=$(curl -sf "https://raw.githubusercontent.com/openshift/api/release-$ocp_version/go.mod" | \ - grep 'k8s.io/api ' | awk '{print $2}' | grep -oP 'v0\.\K[0-9]+') + local k8s_module_version + k8s_module_version=$(curl -sf "https://raw.githubusercontent.com/openshift/api/release-$ocp_version/go.mod" | \ + grep 'k8s.io/api ' | awk '{print $2}' | sed -E -n '/^v0\.[0-9]+\.[0-9]+$/p') - if [[ -n "$k8s_minor" ]]; then - echo "1.$k8s_minor.0" + if [[ -n "$k8s_module_version" ]]; then + # Transform v0.X.Y to 1.X.Y (preserving patch version) + echo "${k8s_module_version/v0./1.}" else echo "" fi @@ -74,7 +75,7 @@ discover_required_ocp_version() { # Check each branch to find which uses our target K8s version for ocp_version in $branches; do local openshift_api_k8s - openshift_api_k8s=$(curl -sf "https://raw.githubusercontent.com/openshift/api/release-$ocp_version/go.mod" | grep 'k8s.io/api ' | awk '{print $2}' | grep -oP 'v0\.\K[0-9]+') + openshift_api_k8s=$(curl -sf "https://raw.githubusercontent.com/openshift/api/release-$ocp_version/go.mod" | grep 'k8s.io/api ' | awk '{print $2}' | sed -E 's/^v0\.([0-9]+).*/\1/') if [[ "$openshift_api_k8s" == "$k8s_minor" ]]; then echo "✅ Found OCP $ocp_version uses K8s 1.$k8s_minor" >&2 @@ -113,7 +114,7 @@ validate_k8s_ocp_compatibility() { # Check openshift/api release branch for K8s version local openshift_api_k8s - openshift_api_k8s=$(curl -sf "https://raw.githubusercontent.com/openshift/api/release-$ocp_version/go.mod" | grep 'k8s.io/api ' | awk '{print $2}' | grep -oP 'v0\.\K[0-9]+') + openshift_api_k8s=$(curl -sf "https://raw.githubusercontent.com/openshift/api/release-$ocp_version/go.mod" | grep 'k8s.io/api ' | awk '{print $2}' | sed -E 's/^v0\.([0-9]+).*/\1/') if [[ "$openshift_api_k8s" != "$k8s_minor" ]]; then echo "ERROR: K8s version mismatch" >&2 @@ -139,7 +140,7 @@ discover_controller_runtime_version() { echo "Discovering compatible controller-runtime version..." >&2 local releases - releases=$(curl -s https://api.github.com/repos/kubernetes-sigs/controller-runtime/releases | grep '"tag_name"' | grep -E '"v0\.' | sed -E 's/.*"v([^"]+)".*/\1/') + releases=$(curl -s https://api.github.com/repos/kubernetes-sigs/controller-runtime/releases | grep '"tag_name"' | grep -E '"v0\.' | grep -Ev -- '-(alpha|beta|rc)[0-9.]*"' | sed -E 's/.*"v([^"]+)".*/\1/') for version in $releases; do local gomod @@ -150,7 +151,7 @@ discover_controller_runtime_version() { fi local cr_k8s_minor cr_go_minor - cr_k8s_minor=$(echo "$gomod" | grep 'k8s.io/apimachinery' | awk '{print $2}' | grep -oP 'v0\.\K[0-9]+' | head -1) + cr_k8s_minor=$(echo "$gomod" | grep 'k8s.io/apimachinery' | awk '{print $2}' | sed -E 's/^v0\.([0-9]+).*/\1/' | head -1) cr_go_minor=$(echo "$gomod" | grep '^go ' | awk '{print $2}' | cut -d. -f2) if [[ "$cr_k8s_minor" == "$k8s_minor" ]] && [[ "$cr_go_minor" -le "$go_minor" ]]; then @@ -200,13 +201,13 @@ discover_controller_tools_version() { local k8s_minor go_minor # Extract minor version from k8s_version parameter (e.g., "1.34.1" -> "34") - k8s_minor=$(echo "$k8s_version" | grep -oP '1\.\K[0-9]+') + k8s_minor=$(echo "$k8s_version" | sed -E 's/^1\.([0-9]+).*/\1/') go_minor=$(echo "$go_version" | cut -d. -f2) echo "Discovering compatible controller-tools version..." >&2 local releases - releases=$(curl -s https://api.github.com/repos/kubernetes-sigs/controller-tools/releases | grep '"tag_name"' | grep -E '"v0\.' | sed -E 's/.*"v([^"]+)".*/\1/') + releases=$(curl -s https://api.github.com/repos/kubernetes-sigs/controller-tools/releases | grep '"tag_name"' | grep -E '"v0\.' | grep -Ev -- '-(alpha|beta|rc)[0-9.]*"' | sed -E 's/.*"v([^"]+)".*/\1/') for version in $releases; do local gomod @@ -217,7 +218,7 @@ discover_controller_tools_version() { fi local ct_k8s_minor ct_go_minor - ct_k8s_minor=$(echo "$gomod" | grep 'k8s.io/apimachinery' | awk '{print $2}' | grep -oP 'v0\.\K[0-9]+' | head -1) + ct_k8s_minor=$(echo "$gomod" | grep 'k8s.io/apimachinery' | awk '{print $2}' | sed -E 's/^v0\.([0-9]+).*/\1/' | head -1) ct_go_minor=$(echo "$gomod" | grep '^go ' | awk '{print $2}' | cut -d. -f2) if [[ "$ct_k8s_minor" == "$k8s_minor" ]] && [[ "$ct_go_minor" -le "$go_minor" ]]; then @@ -292,7 +293,7 @@ discover_golangci_lint_version() { echo "Discovering compatible golangci-lint version..." >&2 local releases - releases=$(curl -s https://api.github.com/repos/golangci/golangci-lint/releases | grep '"tag_name"' | sed -E 's/.*"v([^"]+)".*/\1/') + releases=$(curl -s https://api.github.com/repos/golangci/golangci-lint/releases | grep '"tag_name"' | grep -Ev -- '-(alpha|beta|rc)[0-9.]*"' | sed -E 's/.*"v([^"]+)".*/\1/') for version in $releases; do local go_req diff --git a/hack/upgrade-automation/scripts/upgrade.sh b/hack/upgrade-automation/scripts/upgrade.sh index 3420f217b..995620b6f 100755 --- a/hack/upgrade-automation/scripts/upgrade.sh +++ b/hack/upgrade-automation/scripts/upgrade.sh @@ -117,11 +117,14 @@ step_update_base_images() { echo "" >&2 # Commit changes - echo "Committing changes..." >&2 git add .ci-operator.yaml .tekton/ Dockerfile Makefile bundle.konflux.Dockerfile konflux.Dockerfile 2>/dev/null || true - git commit -m "Update Makefile and Dockerfiles to use the new Golang version base image to ${go_minor}" - - echo "✅ Step 1 complete" >&2 + if [[ -n "$(git status --porcelain)" ]]; then + echo "Committing changes..." >&2 + git commit -m "Update Makefile and Dockerfiles to use the new Golang version base image to ${go_minor}" + echo "✅ Step 1 complete (changes committed)" >&2 + else + echo "✅ Step 1 complete (no changes)" >&2 + fi } # ============================================================================== @@ -147,11 +150,14 @@ step_update_tools() { "$golint_version" echo "" >&2 - echo "Committing changes..." >&2 git add Makefile - git commit -m "Update tools in Makefile" - - echo "✅ Step 2 complete" >&2 + if [[ -n "$(git status --porcelain)" ]]; then + echo "Committing changes..." >&2 + git commit -m "Update tools in Makefile" + echo "✅ Step 2 complete (changes committed)" >&2 + else + echo "✅ Step 2 complete (no changes)" >&2 + fi } # ============================================================================== @@ -412,7 +418,8 @@ step_update_go_mod() { echo "⚠️ Found $require_count require blocks, consolidating to 2..." >&2 # Create temporary file to rebuild go.mod - local temp_gomod=$(mktemp) + local temp_gomod + temp_gomod=$(mktemp) || { echo "ERROR: Failed to create temporary file" >&2; return 1; } # Copy everything before first require block awk '/^require \(/{exit} {print}' go.mod > "$temp_gomod" @@ -510,11 +517,8 @@ step_update_vendor() { echo "" >&2 - echo "Removing existing vendor directory..." >&2 - rm -rf vendor/ - - echo "Running go mod vendor..." >&2 - go mod vendor + echo "Running make vendor..." >&2 + make vendor # After vendoring is complete, restore the go directive to match the container version # The vendor directory doesn't depend on the go directive - only builds do @@ -523,10 +527,10 @@ step_update_vendor() { if [[ "$actual_go_version" != "$go_version" ]]; then echo "" >&2 - echo "⚠️ Go directive was upgraded to $actual_go_version (required by dependencies)" >&2 - echo " Restoring to container version $go_version for build compatibility..." >&2 - update_go_mod_directive "$go_version" - echo "✅ Restored go directive to $go_version" >&2 + echo "❌ ERROR: vendoring resolved dependencies that require Go $actual_go_version" >&2 + echo " The selected dependencies require a higher Go version than the container ($go_version)." >&2 + echo " Refusing to rewrite go.mod to a lower version." >&2 + return 1 else echo "✅ go directive is already at container version: $go_version" >&2 fi @@ -608,6 +612,14 @@ step_run_tests() { fi echo "✅ build passed" >&2 + echo "" >&2 + echo "Running code quality auto-fixes..." >&2 + echo "Running make fmt..." >&2 + make fmt || true + echo "Running make goimports..." >&2 + make goimports || true + echo "✅ auto-fixes complete" >&2 + echo "" >&2 echo "Running make test..." >&2 if ! make test; then diff --git a/internal/controller/enoexecevent/handler/enoexecevent_controller.go b/internal/controller/enoexecevent/handler/enoexecevent_controller.go index 50752be4b..96ea150f7 100644 --- a/internal/controller/enoexecevent/handler/enoexecevent_controller.go +++ b/internal/controller/enoexecevent/handler/enoexecevent_controller.go @@ -21,6 +21,7 @@ import ( runtime2 "runtime" v1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/record" @@ -94,11 +95,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu // Log the ENoExecEvent instance logger.Info("Reconciling ENoExecEvent", "name", eNoExecEvent.Name, "namespace", eNoExecEvent.Namespace) ret, err := r.reconcile(ctx, eNoExecEvent) - // If the reconciliation was successful, or one of the objects was not found, we delete the ENoExecEvent resource. - if client.IgnoreNotFound(err) == nil { + + // If the reconciliation was successful, delete the ENoExecEvent resource. + if err == nil { if err := r.Delete(ctx, &eNoExecEvent.ENoExecEvent); err != nil { logger.Error(err, "Failed to delete ENoExecEvent resource after reconciliation", "name", eNoExecEvent.Name) - // Mark as error but return the original error so client.IgnoreNotFound works r.markAsError(ctx, eNoExecEvent, ErrorReasonReconciliation) return ctrl.Result{}, client.IgnoreNotFound(err) } @@ -106,9 +107,22 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu logger.Info("Deleted ENoExecEvent resource after successful reconciliation", "name", eNoExecEvent.Name) return ret, nil } + + // If pod/node not found, this is a stale event (object was deleted before processing) + if apierrors.IsNotFound(err) { + if err := r.Delete(ctx, &eNoExecEvent.ENoExecEvent); err != nil { + logger.Error(err, "Failed to delete stale ENoExecEvent resource", "name", eNoExecEvent.Name) + r.markAsError(ctx, eNoExecEvent, ErrorReasonReconciliation) + return ctrl.Result{}, client.IgnoreNotFound(err) + } + metrics.EnoexecCounterStale.Inc() + logger.Info("Deleted stale ENoExecEvent resource (pod/node not found)", "name", eNoExecEvent.Name) + return ret, nil + } + + // Real reconciliation error that will be retried metrics.EnoexecCounterInvalid.Inc() logger.Error(err, "Failed to reconcile ENoExecEvent", "name", eNoExecEvent.Name) - // Mark as error - this is a real reconciliation error that will be retried r.markAsError(ctx, eNoExecEvent, ErrorReasonReconciliation) return ctrl.Result{}, err } diff --git a/internal/controller/enoexecevent/handler/metrics/metrics.go b/internal/controller/enoexecevent/handler/metrics/metrics.go index b223cbd89..68cae19b4 100644 --- a/internal/controller/enoexecevent/handler/metrics/metrics.go +++ b/internal/controller/enoexecevent/handler/metrics/metrics.go @@ -10,6 +10,7 @@ import ( var EnoexecCounter prometheus.Counter var EnoexecCounterInvalid prometheus.Counter +var EnoexecCounterStale prometheus.Counter var onceCommon sync.Once func initMetrics() { @@ -23,11 +24,18 @@ func initMetrics() { EnoexecCounterInvalid = prometheus.NewCounter( prometheus.CounterOpts{ Name: "mto_enoexecevents_invalid_total", - Help: "The counter for ENoExecEvents objects that faled the reconciliation and report as pod events", + Help: "The counter for ENoExecEvents objects that failed reconciliation and were reported as pod events", + }, + ) + EnoexecCounterStale = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "mto_enoexecevents_stale_total", + Help: "The counter for ENoExecEvents referencing pods/nodes that no longer exist", }, ) metrics2.Registry.MustRegister(EnoexecCounter) metrics2.Registry.MustRegister(EnoexecCounterInvalid) + metrics2.Registry.MustRegister(EnoexecCounterStale) }) } diff --git a/internal/controller/operator/clusterpodplacementconfig_controller.go b/internal/controller/operator/clusterpodplacementconfig_controller.go index c79f93bc4..c5383e9e5 100644 --- a/internal/controller/operator/clusterpodplacementconfig_controller.go +++ b/internal/controller/operator/clusterpodplacementconfig_controller.go @@ -582,7 +582,11 @@ func (r *ClusterPodPlacementConfigReconciler) handleEnoexecDelete(ctx context.Co } // Delete errored ENoExecEvent resources and count remaining non-errored ones - nonErroredCount, _ := r.deleteErroredENoExecEvents(ctx, enoexecEventList) + nonErroredCount, _, err := r.deleteErroredENoExecEvents(ctx, enoexecEventList) + if err != nil { + log.Error(err, "Failed to delete errored ENoExecEvent resources") + return err + } // Only block cleanup if there are non-errored ENoExecEvent resources if nonErroredCount > 0 { @@ -941,9 +945,10 @@ func isDeploymentUpToDate(deployment *appsv1.Deployment) bool { } // deleteErroredENoExecEvents deletes ENoExecEvent resources that are marked with an error label. -// Returns the count of non-errored and errored events found. -func (r *ClusterPodPlacementConfigReconciler) deleteErroredENoExecEvents(ctx context.Context, enoexecEventList *multiarchv1beta1.ENoExecEventList) (nonErroredCount, erroredCount int) { +// Returns the count of non-errored and errored events found, and any delete errors encountered. +func (r *ClusterPodPlacementConfigReconciler) deleteErroredENoExecEvents(ctx context.Context, enoexecEventList *multiarchv1beta1.ENoExecEventList) (nonErroredCount, erroredCount int, err error) { log := ctrllog.FromContext(ctx) + var errs []error for i := range enoexecEventList.Items { enoexecEvent := &enoexecEventList.Items[i] // Check if this ENoExecEvent is marked as errored @@ -959,6 +964,7 @@ func (r *ClusterPodPlacementConfigReconciler) deleteErroredENoExecEvents(ctx con erroredCount++ if deleteErr := r.Delete(ctx, enoexecEvent); deleteErr != nil { log.Error(deleteErr, "Failed to delete errored ENoExecEvent", "name", enoexecEvent.Name) + errs = append(errs, deleteErr) } else { log.V(1).Info("Deleted errored ENoExecEvent", "name", enoexecEvent.Name) } @@ -967,7 +973,7 @@ func (r *ClusterPodPlacementConfigReconciler) deleteErroredENoExecEvents(ctx con if erroredCount > 0 { log.Info("Deleted errored ENoExecEvent resources during cleanup", "erroredCount", erroredCount) } - return nonErroredCount, erroredCount + return nonErroredCount, erroredCount, errorutils.NewAggregate(errs) } // SetupWithManager sets up the controller with the Manager. diff --git a/internal/controller/operator/clusterpodplacementconfig_controller_test.go b/internal/controller/operator/clusterpodplacementconfig_controller_test.go index 4d9c0ec14..16537eb34 100644 --- a/internal/controller/operator/clusterpodplacementconfig_controller_test.go +++ b/internal/controller/operator/clusterpodplacementconfig_controller_test.go @@ -392,7 +392,7 @@ var _ = Describe("internal/Controller/ClusterPodPlacementConfig/ClusterPodPlacem framework.NewConditionTypeStatusTuple(v1beta1.MutatingWebhookConfigurationNotAvailable, corev1.ConditionTrue), framework.NewConditionTypeStatusTuple(v1beta1.DeprovisioningType, corev1.ConditionTrue), ) - }) + }).Should(Succeed(), "CPPC should remain in deprovisioning state while gated pod exists") By("Manually delete the gated pod") err = k8sClient.Delete(ctx, pod) Expect(err).NotTo(HaveOccurred(), "failed to delete pod", err) @@ -983,7 +983,8 @@ var _ = Describe("internal/Controller/ClusterPodPlacementConfig/ClusterPodPlacem Expect(k8sClient.List(ctx, enoexecEventList, crclient.InNamespace(utils.Namespace()))).To(Succeed()) By("Calling deleteErroredENoExecEvents") - nonErroredCount, erroredCount := reconciler.deleteErroredENoExecEvents(ctx, enoexecEventList) + nonErroredCount, erroredCount, err := reconciler.deleteErroredENoExecEvents(ctx, enoexecEventList) + Expect(err).NotTo(HaveOccurred()) By("Verifying the counts") Expect(nonErroredCount).To(Equal(2), "should have 2 non-errored events") @@ -1005,7 +1006,7 @@ var _ = Describe("internal/Controller/ClusterPodPlacementConfig/ClusterPodPlacem }).Should(Succeed()) By("Verifying non-errored events still exist") - err := k8sClient.Get(ctx, crclient.ObjectKey{ + err = k8sClient.Get(ctx, crclient.ObjectKey{ Name: nonErroredENEE1.Name, Namespace: utils.Namespace(), }, &v1beta1.ENoExecEvent{}) @@ -1024,7 +1025,8 @@ var _ = Describe("internal/Controller/ClusterPodPlacementConfig/ClusterPodPlacem It("should handle empty list gracefully", func() { emptyList := &v1beta1.ENoExecEventList{} - nonErroredCount, erroredCount := reconciler.deleteErroredENoExecEvents(ctx, emptyList) + nonErroredCount, erroredCount, err := reconciler.deleteErroredENoExecEvents(ctx, emptyList) + Expect(err).NotTo(HaveOccurred()) Expect(nonErroredCount).To(Equal(0)) Expect(erroredCount).To(Equal(0)) }) @@ -1040,7 +1042,8 @@ var _ = Describe("internal/Controller/ClusterPodPlacementConfig/ClusterPodPlacem enoexecEventList := &v1beta1.ENoExecEventList{} Expect(k8sClient.List(ctx, enoexecEventList, crclient.InNamespace(utils.Namespace()))).To(Succeed()) - nonErroredCount, erroredCount := reconciler.deleteErroredENoExecEvents(ctx, enoexecEventList) + nonErroredCount, erroredCount, err := reconciler.deleteErroredENoExecEvents(ctx, enoexecEventList) + Expect(err).NotTo(HaveOccurred()) Expect(nonErroredCount).To(BeNumerically(">", 0), "should count events with no labels as non-errored") Expect(erroredCount).To(Equal(0)) diff --git a/internal/controller/operator/objects.go b/internal/controller/operator/objects.go index 04dcf6d06..934cdd38b 100644 --- a/internal/controller/operator/objects.go +++ b/internal/controller/operator/objects.go @@ -339,11 +339,13 @@ func buildServiceMonitor(name string) *monitoringv1.ServiceMonitor { Spec: monitoringv1.ServiceMonitorSpec{ Endpoints: []monitoringv1.Endpoint{ { - HonorLabels: true, - Path: "/metrics", - Port: "metrics", - Scheme: "https", - BearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token", + HonorLabels: true, + Path: "/metrics", + Port: "metrics", + Scheme: "https", + Authorization: &monitoringv1.SafeAuthorization{ + Type: "Bearer", + }, TLSConfig: &monitoringv1.TLSConfig{ CAFile: "/etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt", SafeTLSConfig: monitoringv1.SafeTLSConfig{ diff --git a/internal/controller/operator/suite_test.go b/internal/controller/operator/suite_test.go index 1f72ea51d..7996a7fff 100644 --- a/internal/controller/operator/suite_test.go +++ b/internal/controller/operator/suite_test.go @@ -167,7 +167,7 @@ func applyResources(resources resmap.ResMap) error { Expect(err).NotTo(HaveOccurred()) if len(raw) == 0 { - return nil // Nothing to process + continue // Skip empty resource, continue processing others } // Decode the resource from the buffer diff --git a/internal/controller/podplacement/pod_model.go b/internal/controller/podplacement/pod_model.go index 5b46934c0..2d91248c8 100644 --- a/internal/controller/podplacement/pod_model.go +++ b/internal/controller/podplacement/pod_model.go @@ -436,7 +436,7 @@ func (pod *Pod) shouldIgnorePod(cppc *v1beta1.ClusterPodPlacementConfig, matchin pod.Spec.NodeName != "" || pod.HasControlPlaneNodeSelector() || pod.IsFromDaemonSet() || pod.isNodeSelectorConfiguredForArchitecture() && (pod.isPreferredAffinityConfiguredForArchitecture() || - (!cppc.PluginsEnabled(common.NodeAffinityScoringPluginName) && !pod.hasMatchingPPCWithPlugin(matchingPPCs))) + ((cppc == nil || !cppc.PluginsEnabled(common.NodeAffinityScoringPluginName)) && !pod.hasMatchingPPCWithPlugin(matchingPPCs))) } // isNodeSelectorConfiguredForArchitecture returns true if the pod has already a nodeSelector for the architecture label diff --git a/internal/controller/podplacement/pod_reconciler.go b/internal/controller/podplacement/pod_reconciler.go index 2b97a1c28..aedd4c19c 100644 --- a/internal/controller/podplacement/pod_reconciler.go +++ b/internal/controller/podplacement/pod_reconciler.go @@ -235,7 +235,11 @@ func (r *PodReconciler) trackSkippedMatchingConfigs(ctx context.Context, pod *Po } } -// pullSecretDataList returns the list of secrets data for the given pod given its imagePullSecrets field +// pullSecretDataList returns the list of secrets data for the given pod given its imagePullSecrets field. +// This function implements fail-open behavior: it logs errors but does not fail if secrets cannot be +// retrieved or parsed. The function returns whatever secret data was successfully collected, even if that's +// an empty list. This allows image inspection to proceed without authentication if necessary, rather than +// blocking pod scheduling due to transient pull secret issues. func (r *PodReconciler) pullSecretDataList(ctx context.Context, pod *Pod) ([][]byte, error) { log := ctrllog.FromContext(ctx) secretAuths := make([][]byte, 0) diff --git a/internal/controller/podplacement/pod_reconciler_test.go b/internal/controller/podplacement/pod_reconciler_test.go index 9af6c0184..4df61e59b 100644 --- a/internal/controller/podplacement/pod_reconciler_test.go +++ b/internal/controller/podplacement/pod_reconciler_test.go @@ -120,6 +120,20 @@ var _ = Describe("Internal/Controller/Podplacement/PodReconciler", func() { cppc := &v1beta1.ClusterPodPlacementConfig{} err := k8sClient.Get(ctx, crclient.ObjectKey{Name: common.SingletonResourceObjectName}, cppc) Expect(err).NotTo(HaveOccurred(), "failed to get ClusterPodPlacementConfig") + originalFallback := cppc.Spec.FallbackArchitecture + DeferCleanup(func() { + latest := &v1beta1.ClusterPodPlacementConfig{} + Expect(k8sClient.Get(ctx, crclient.ObjectKey{Name: common.SingletonResourceObjectName}, latest)).To(Succeed()) + latest.Spec.FallbackArchitecture = originalFallback + Expect(k8sClient.Update(ctx, latest)).To(Succeed()) + Eventually(func() string { + cppc := clusterpodplacementconfig.GetClusterPodPlacementConfig() + if cppc == nil { + return "nil" + } + return cppc.Spec.FallbackArchitecture + }).Should(Equal(originalFallback), "cache did not update with reverted ClusterPodPlacementConfig") + }) cppc.Spec.FallbackArchitecture = utils.ArchitectureAmd64 err = k8sClient.Update(ctx, cppc) Expect(err).NotTo(HaveOccurred(), "failed to update ClusterPodPlacementConfig") @@ -173,26 +187,14 @@ var _ = Describe("Internal/Controller/Podplacement/PodReconciler", func() { g.Expect(pod.Labels).To(HaveKeyWithValue(utils.NodeAffinityLabel, utils.NodeAffinityLabelValueSet), "node affinity label not found") }).WithTimeout(e2e.WaitShort).Should(Succeed(), "failed to process fallback architecture") - - // Cleanup: Revert CPPC changes - cppc.Spec.FallbackArchitecture = "" - err = k8sClient.Update(ctx, cppc) - Expect(err).NotTo(HaveOccurred(), "failed to revert ClusterPodPlacementConfig") - Eventually(func() string { - cppc := clusterpodplacementconfig.GetClusterPodPlacementConfig() - if cppc == nil { - return "nil" - } - return cppc.Spec.FallbackArchitecture - }).Should(Equal(""), "cache did not update with reverted ClusterPodPlacementConfig") }) }) Context("with different pull secrets", func() { - It("handles images with global pull secrets correctly", func() { + PIt("handles images with global pull secrets correctly", func() { // TODO: Test logic for handling a Pod with one container and image using global pull secret }) - It("handles images with local pull secrets correctly", func() { + PIt("handles images with local pull secrets correctly", func() { // TODO: Test logic for handling a Pod with one container and image using local pull secret }) }) @@ -417,11 +419,11 @@ var _ = Describe("Internal/Controller/Podplacement/PodReconciler", func() { }) When("Handling Multi-container Pods", func() { Context("with different image types and different auth credentials sources", func() { - It("handles node affinity as the intersection of the compatible architectures of each multi-arch image", func() { + PIt("handles node affinity as the intersection of the compatible architectures of each multi-arch image", func() { // TODO: Test logic for handling a Pod with multiple multi-arch image-based containers }) - It("handles node affinity of multi-arch images and single-arch image setting the only one possible", func() { + PIt("handles node affinity of multi-arch images and single-arch image setting the only one possible", func() { // TODO: Test logic for handling a Pod with multiple multi-arch image-based containers }) diff --git a/internal/controller/podplacement/scheduling_gate_mutating_webhook.go b/internal/controller/podplacement/scheduling_gate_mutating_webhook.go index cbb317f1c..362920025 100644 --- a/internal/controller/podplacement/scheduling_gate_mutating_webhook.go +++ b/internal/controller/podplacement/scheduling_gate_mutating_webhook.go @@ -85,8 +85,9 @@ func (a *PodSchedulingGateMutatingWebHook) Handle(ctx context.Context, req admis ppcList := &multiarchv1beta1.PodPlacementConfigList{} if err := a.client.List(ctx, ppcList, client.InNamespace(pod.Namespace)); err != nil { log.Error(err, "Failed to list existing PodPlacementConfigs in namespace") - // On error, proceed without PPC filtering - fail open - ppcList.Items = []multiarchv1beta1.PodPlacementConfig{} + // True fail-open for admission: admit without mutating when namespace config + // cannot be read. + return a.patchedPodResponse(pod.PodObject(), req) } // Filter to only PPCs that match this pod's labels - do this once for efficiency diff --git a/internal/controller/podplacement/suite_test.go b/internal/controller/podplacement/suite_test.go index f0ea5b4f1..8b3ea27a0 100644 --- a/internal/controller/podplacement/suite_test.go +++ b/internal/controller/podplacement/suite_test.go @@ -215,8 +215,9 @@ var _ = SynchronizedAfterSuite(func() {}, func() { err := k8sClient.Delete(ctx, builder.NewClusterPodPlacementConfig().WithName(common.SingletonResourceObjectName).Build()) Expect(err).NotTo(HaveOccurred(), "failed to delete ClusterPodPlacementConfig", err) Eventually(testingutils.ValidateDeletion(k8sClient, ctx)).Should(Succeed(), "the ClusterPodPlacementConfig should be deleted") - By("Checking the cache is empty") - Expect(clusterpodplacementconfig.GetClusterPodPlacementConfig()).To(BeNil()) + By("Waiting for the cache to become empty") + Eventually(clusterpodplacementconfig.GetClusterPodPlacementConfig). + Should(BeNil(), "cache still contains ClusterPodPlacementConfig after deletion") By("tearing down the test environment") stopMgr() @@ -274,6 +275,7 @@ func startRegistry() { // See https://github.com/distribution/distribution/blob/28c8bc6c0e4b5dfc380e0fa3058d4877fabdfa4a/registry/registry.go#L143 resp, err := httpClient.Get(fmt.Sprintf("https://%s/", registryAddress)) g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() //nolint:errcheck g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) }).MustPassRepeatedly(3).Should( Succeed(), "registry server is not ready yet") @@ -382,6 +384,7 @@ func runManager() { By("Starting the manager") go func() { + defer GinkgoRecover() var mgrCtx context.Context mgrCtx, stopMgr = context.WithCancel(ctx) err = mgr.Start(mgrCtx) @@ -392,6 +395,7 @@ func runManager() { Eventually(func(g Gomega) { resp, err := http.Get("http://127.0.0.1:4980/readyz") g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() //nolint:errcheck g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) }).MustPassRepeatedly(3).Should( Succeed(), "manager is not ready yet") diff --git a/internal/controller/podplacementconfig/suite_test.go b/internal/controller/podplacementconfig/suite_test.go index ed120d8d9..f7b9fbe4d 100644 --- a/internal/controller/podplacementconfig/suite_test.go +++ b/internal/controller/podplacementconfig/suite_test.go @@ -136,8 +136,9 @@ var _ = SynchronizedAfterSuite(func() {}, func() { err := k8sClient.Delete(ctx, builder.NewClusterPodPlacementConfig().WithName(common.SingletonResourceObjectName).Build()) Expect(err).NotTo(HaveOccurred(), "failed to delete ClusterPodPlacementConfig", err) Eventually(testingutils.ValidateDeletion(k8sClient, ctx)).Should(Succeed(), "the ClusterPodPlacementConfig should be deleted") - By("Checking the cache is empty") - Expect(clusterpodplacementconfig.GetClusterPodPlacementConfig()).To(BeNil()) + By("Waiting for the cache to become empty") + Eventually(clusterpodplacementconfig.GetClusterPodPlacementConfig). + Should(BeNil(), "cache still contains ClusterPodPlacementConfig after deletion") By("tearing down the test environment") stopMgr() @@ -207,6 +208,7 @@ func runManager() { By("Starting the manager") go func() { + defer GinkgoRecover() var mgrCtx context.Context mgrCtx, stopMgr = context.WithCancel(ctx) err = mgr.Start(mgrCtx) @@ -217,6 +219,7 @@ func runManager() { Eventually(func(g Gomega) { resp, err := http.Get("http://127.0.0.1:4980/readyz") g.Expect(err).NotTo(HaveOccurred()) + defer resp.Body.Close() //nolint:errcheck g.Expect(resp.StatusCode).To(Equal(http.StatusOK)) }).MustPassRepeatedly(3).Should( Succeed(), "manager is not ready yet") diff --git a/pkg/testing/framework/utils_test.go b/pkg/testing/framework/utils_test.go index 0f443856c..3057f6aea 100644 --- a/pkg/testing/framework/utils_test.go +++ b/pkg/testing/framework/utils_test.go @@ -90,7 +90,7 @@ func Test_GetClusterMinorVersion(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { g := NewGomegaWithT(t) - client := fake.NewSimpleClientset() + client := fake.NewClientset() client.Discovery().(*fakediscovery.FakeDiscovery).FakedServerVersion = tt.serverInfo version, err := GetClusterMinorVersion(client)