Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion internal/api/features/_topics.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"nrtcrdanns",
"netpols",
"mgselinuxcollect",
"tlscompliance"
"tlscompliance",
"operobsgen"
]
}
8 changes: 7 additions & 1 deletion internal/api/features/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@ import (
)

const (
Version = "v5.0.0"
// Version is the version of the API, and covers data layout.
// The version should be bumped when the API definition changes,
// and is not strictly tied to the operator version.
// The payload (e.g. active topics) may and will changes even
// within the same operator version; e.g. is totally legit
// to add more topics during a Z stream.
Version = "v1.0.0"

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will be really hard to track, or i'll have to be more blunt we'll simply forget to update this value.
Without an automated check it won't fly IMVHO, which I'm sure will also add more complexity.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

likely correct, but using the version as before is pointless. We can just remove the version at that point. Either it's dependable signal or should be ignored.

)

type Metadata struct {
Expand Down
19 changes: 13 additions & 6 deletions test/e2e/serial/config/fixture.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
nrtv1alpha2 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"

nropv1 "github.com/openshift-kni/numaresources-operator/api/v1"
"github.com/openshift-kni/numaresources-operator/internal/api/features"
intnrt "github.com/openshift-kni/numaresources-operator/internal/noderesourcetopology"
e2efixture "github.com/openshift-kni/numaresources-operator/test/internal/fixture"
"github.com/openshift-kni/numaresources-operator/test/internal/objects"
Expand All @@ -35,6 +36,7 @@ type E2EConfig struct {
NROOperObj *nropv1.NUMAResourcesOperator
NROSchedObj *nropv1.NUMAResourcesScheduler
SchedulerName string
ClusterTopics features.TopicInfo
infraNRTList nrtv1alpha2.NodeResourceTopologyList
}

Expand Down Expand Up @@ -64,43 +66,48 @@ var Config *E2EConfig

func SetupFixture() error {
var err error
Config, err = NewFixtureWithOptions("e2e-test-infra", e2efixture.OptionRandomizeName|e2efixture.OptionAvoidCooldown|e2efixture.OptionStaticClusterData)
Config, err = NewFixtureWithOptions(context.Background(), "e2e-test-infra", e2efixture.OptionRandomizeName|e2efixture.OptionAvoidCooldown|e2efixture.OptionStaticClusterData)
return err
}

func TeardownFixture() error {
return e2efixture.Teardown(Config.Fixture)
}

func NewFixtureWithOptions(nsName string, options e2efixture.Options) (*E2EConfig, error) {
func NewFixtureWithOptions(ctx context.Context, nsName string, options e2efixture.Options) (*E2EConfig, error) {
var err error
cfg := E2EConfig{
NROOperObj: &nropv1.NUMAResourcesOperator{},
NROSchedObj: &nropv1.NUMAResourcesScheduler{},
}

cfg.Fixture, err = e2efixture.SetupWithOptions(nsName, nrtv1alpha2.NodeResourceTopologyList{}, options)
cfg.Fixture, err = e2efixture.SetupWithOptions(ctx, nsName, nrtv1alpha2.NodeResourceTopologyList{}, options)
if err != nil {
return nil, err
}

err = cfg.Fixture.Client.List(context.TODO(), &cfg.infraNRTList)
err = cfg.Fixture.Client.List(ctx, &cfg.infraNRTList)
if err != nil {
return nil, err
}

err = cfg.Fixture.Client.Get(context.TODO(), objects.NROObjectKey(), cfg.NROOperObj)
err = cfg.Fixture.Client.Get(ctx, objects.NROObjectKey(), cfg.NROOperObj)
if err != nil {
return nil, err
}

err = cfg.Fixture.Client.Get(context.TODO(), objects.NROSchedObjectKey(), cfg.NROSchedObj)
err = cfg.Fixture.Client.Get(ctx, objects.NROSchedObjectKey(), cfg.NROSchedObj)
if err != nil {
return nil, err
}

cfg.SchedulerName = cfg.NROSchedObj.Status.SchedulerName
klog.InfoS("detected scheduler name", "schedulerName", cfg.SchedulerName)

cfg.ClusterTopics, err = FetchClusterTopics(ctx, cfg.Fixture, cfg.NROOperObj)
if err != nil {
return nil, err
}

return &cfg, nil
}
39 changes: 39 additions & 0 deletions test/e2e/serial/config/infra.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package config

import (
"context"
"encoding/json"
"fmt"
"os"
"sync"
Expand All @@ -32,11 +33,14 @@ import (
nrtv1alpha2 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"

nropv1 "github.com/openshift-kni/numaresources-operator/api/v1"
"github.com/openshift-kni/numaresources-operator/internal/api/features"
"github.com/openshift-kni/numaresources-operator/internal/nodegroups"
"github.com/openshift-kni/numaresources-operator/internal/remoteexec"
"github.com/openshift-kni/numaresources-operator/internal/wait"
"github.com/openshift-kni/numaresources-operator/pkg/objectnames"
numacellapi "github.com/openshift-kni/numaresources-operator/test/deviceplugin/pkg/numacell/api"
numacellmanifests "github.com/openshift-kni/numaresources-operator/test/deviceplugin/pkg/numacell/manifests"
"github.com/openshift-kni/numaresources-operator/test/internal/deploy"
e2efixture "github.com/openshift-kni/numaresources-operator/test/internal/fixture"
"github.com/openshift-kni/numaresources-operator/test/internal/images"

Expand Down Expand Up @@ -211,3 +215,38 @@ func unlabelNodeByName(cli client.Client, nodeName string) {
g.Expect(err).ToNot(HaveOccurred())
}).WithTimeout(3*time.Minute).WithPolling(30*time.Second).Should(Succeed(), "failed to unlabel node %q: %v", nodeName, err)
}

func FetchClusterTopics(ctx context.Context, fxt *e2efixture.Fixture, nropObj *nropv1.NUMAResourcesOperator) (features.TopicInfo, error) {
pod, err := deploy.FindNUMAResourcesOperatorPod(ctx, fxt.Client, nropObj)
if err != nil {
return features.NewTopicInfo(), err
}

cmdline := []string{
"/bin/numaresources-operator",
"--inspect-features",
}
stdout, stderr, err := remoteexec.CommandOnPod(ctx, fxt.K8sClient, pod, cmdline...)
// older version may miss introspection support that's fine
if err != nil {
return features.NewTopicInfo(), nil
}
Comment on lines +231 to +233

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Don’t swallow all introspection execution failures.

This currently treats every CommandOnPod failure as “feature unsupported”, which can mask real cluster/test failures and silently disable the stricter checks.

Suggested fix
 import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"os"
+	"strings"
 	"sync"
 	"time"
@@
 	stdout, stderr, err := remoteexec.CommandOnPod(ctx, fxt.K8sClient, pod, cmdline...)
-	// older version may miss introspection support that's fine
 	if err != nil {
-		return features.NewTopicInfo(), nil
+		serr := strings.ToLower(stderr)
+		// older versions may miss introspection support; only ignore that class of errors
+		if strings.Contains(serr, "inspect-features") && (strings.Contains(serr, "unknown") || strings.Contains(serr, "not found")) {
+			return features.NewTopicInfo(), nil
+		}
+		return features.NewTopicInfo(), fmt.Errorf("cannot inspect cluster features: %w (stderr=%q)", err, stderr)
 	}
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if err != nil {
return features.NewTopicInfo(), nil
}
import (
"context"
"encoding/json"
"fmt"
"os"
"strings"
"sync"
"time"
// ... rest of imports
)
stdout, stderr, err := remoteexec.CommandOnPod(ctx, fxt.K8sClient, pod, cmdline...)
if err != nil {
serr := strings.ToLower(stderr)
// older versions may miss introspection support; only ignore that class of errors
if strings.Contains(serr, "inspect-features") && (strings.Contains(serr, "unknown") || strings.Contains(serr, "not found")) {
return features.NewTopicInfo(), nil
}
return features.NewTopicInfo(), fmt.Errorf("cannot inspect cluster features: %w (stderr=%q)", err, stderr)
}
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@test/e2e/serial/config/infra.go` around lines 231 - 233, The code currently
swallows all errors from the CommandOnPod call by returning
features.NewTopicInfo(), nil; instead, change the error handling so only known
“feature unsupported” conditions are treated as non-fatal and all other errors
are returned (or wrapped) to fail the test. Locate the CommandOnPod invocation
and replace the unconditional if err != nil { return features.NewTopicInfo(),
nil } with logic that inspects the error (type or message) and for genuine
unsupported indicators returns features.NewTopicInfo(), nil, but otherwise
returns nil, fmt.Errorf("introspection failed: %w", err) (or simply return the
error) so real cluster/test failures aren’t masked.


if len(stderr) > 0 {
klog.InfoS("while fetching cluster topics", "stderr", stderr)
}

var tp features.TopicInfo
err = json.Unmarshal(stdout, &tp)
if err != nil {
return features.NewTopicInfo(), err
}
klog.InfoS("active features from the deployed operator", "features", string(stdout))

tp.Metadata.Version = features.Version
err = tp.Validate()
if err != nil {
return features.NewTopicInfo(), err
}
return tp, nil
}
43 changes: 26 additions & 17 deletions test/e2e/serial/tests/configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"errors"
"fmt"
"reflect"
"slices"
"sort"
"strconv"
"strings"
Expand Down Expand Up @@ -234,25 +235,29 @@ var _ = Describe("[serial][disruptive] numaresources configuration management",
g.Expect(isNROOperSyncedAt(&updatedOperObj.Status, status.ConditionAvailable, updatedOperObj.Generation)).To(Succeed())
}).WithTimeout(10*time.Minute).WithPolling(30*time.Second).Should(Succeed(), "failed to update RTE daemonset node selector")

dss, err := objects.GetDaemonSetsByNamespacedName(fxt.Client, ctx, updatedOperObj.Status.DaemonSets...)
Expect(err).ToNot(HaveOccurred())
// Eventually is not strictly needed but till all the supported version report ObservedGeneration in status
// (aka oldest supported version is 4.20) we can't really depend on `isNROOperUpToDate`
Eventually(func(g Gomega) {
dss, err := objects.GetDaemonSetsByNamespacedName(fxt.Client, ctx, updatedOperObj.Status.DaemonSets...)
g.Expect(err).ToNot(HaveOccurred())

Expect(dss).To(HaveLen(len(nroOperObj.Spec.NodeGroups)), "daemonsets found owned by NRO object doesn't align with specified NodeGroups")
for _, ds := range dss {
e2efixture.By("check RTE daemonset %q", ds.Name)
if ds.Name == objectnames.GetComponentName(updatedOperObj.Name, roleMCPTest) {
By("check the correct match labels for the new RTE daemonset")
Expect(ds.Spec.Template.Spec.NodeSelector).To(Equal(testMCP.Spec.NodeSelector.MatchLabels))
g.Expect(dss).To(HaveLen(len(nroOperObj.Spec.NodeGroups)), "daemonsets found owned by NRO object doesn't align with specified NodeGroups")
for _, ds := range dss {
e2efixture.By("check RTE daemonset %q", ds.Name)
if ds.Name == objectnames.GetComponentName(updatedOperObj.Name, roleMCPTest) {
By("check the correct match labels for the new RTE daemonset")
g.Expect(ds.Spec.Template.Spec.NodeSelector).To(Equal(testMCP.Spec.NodeSelector.MatchLabels))
}
By("check the correct image")
cnt := ds.Spec.Template.Spec.Containers[0] // shortcut
g.Expect(cnt.Image).To(Equal(serialconfig.GetRteCiImage()))

By("checking the correct LogLevel")
found, match := matchLogLevelToKlog(&cnt, newLogLevel)
g.Expect(found).To(BeTrue(), "-v flag doesn't exist in container %q args under DaemonSet: %q", cnt.Name, ds.Name)
g.Expect(match).To(BeTrue(), "LogLevel %s doesn't match the existing -v flag in container: %q managed by DaemonSet: %q", updatedOperObj.Spec.LogLevel, cnt.Name, ds.Name)
}
By("check the correct image")
cnt := ds.Spec.Template.Spec.Containers[0] // shortcut
Expect(cnt.Image).To(Equal(serialconfig.GetRteCiImage()))

By("checking the correct LogLevel")
found, match := matchLogLevelToKlog(&cnt, newLogLevel)
Expect(found).To(BeTrue(), "-v flag doesn't exist in container %q args under DaemonSet: %q", cnt.Name, ds.Name)
Expect(match).To(BeTrue(), "LogLevel %s doesn't match the existing -v flag in container: %q managed by DaemonSet: %q", updatedOperObj.Spec.LogLevel, cnt.Name, ds.Name)
}
}).WithTimeout(10*time.Minute).WithPolling(30*time.Second).Should(Succeed(), "failed to update RTE daemonset node selector")
})

It("should converge with mixed SELinux policy across node groups", Label(label.Tier2, label.OpenShift), func(ctx context.Context) {
Expand Down Expand Up @@ -1844,6 +1849,10 @@ func isNROOperSyncedAt(nropStatus *nropv1.NUMAResourcesOperatorStatus, condition
if cond.Status != metav1.ConditionTrue {
return errors.New("condition not true")
}
// TODO: until all the version reports ObservedGeneration
if serialconfig.Config != nil && !slices.Contains(serialconfig.Config.ClusterTopics.Active, "operobsgen") {
return nil
}
if cond.ObservedGeneration != gen {
return fmt.Errorf("condition at %d expected %d", cond.ObservedGeneration, gen)
}
Expand Down
5 changes: 2 additions & 3 deletions test/internal/fixture/fixture.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,7 @@ func init() {
cooldownThreshold = getCooldownThresholdFromEnvVar()
}

func SetupWithOptions(name string, nrtList nrtv1alpha2.NodeResourceTopologyList, options Options) (*Fixture, error) {
ctx := context.Background()
func SetupWithOptions(ctx context.Context, name string, nrtList nrtv1alpha2.NodeResourceTopologyList, options Options) (*Fixture, error) {
if !e2eclient.ClientsEnabled {
return nil, fmt.Errorf("clients not enabled")
}
Expand Down Expand Up @@ -191,7 +190,7 @@ func SetupWithOptions(name string, nrtList nrtv1alpha2.NodeResourceTopologyList,
}

func Setup(baseName string, nrtList nrtv1alpha2.NodeResourceTopologyList) (*Fixture, error) {
return SetupWithOptions(baseName, nrtList, OptionRandomizeName)
return SetupWithOptions(context.Background(), baseName, nrtList, OptionRandomizeName)
}

func Teardown(ft *Fixture) error {
Expand Down
Loading