diff --git "a/changelogs/unreleased/9663-Lyndon-Li\342\200\216\342\200\216" b/changelogs/unreleased/9683-Lyndon-Li similarity index 100% rename from "changelogs/unreleased/9663-Lyndon-Li\342\200\216\342\200\216" rename to changelogs/unreleased/9683-Lyndon-Li diff --git a/changelogs/unreleased/9697-Joeavaikath b/changelogs/unreleased/9697-Joeavaikath new file mode 100644 index 0000000000..ad8e5eb2e2 --- /dev/null +++ b/changelogs/unreleased/9697-Joeavaikath @@ -0,0 +1 @@ +Fail backup validation when built-in data mover is requested but no node-agent pods are running diff --git a/pkg/backup/actions/csi/pvc_action_test.go b/pkg/backup/actions/csi/pvc_action_test.go index efcb0b0abf..0ae72976fe 100644 --- a/pkg/backup/actions/csi/pvc_action_test.go +++ b/pkg/backup/actions/csi/pvc_action_test.go @@ -94,6 +94,7 @@ func TestExecute(t *testing.T) { expectedDataUpload *velerov2alpha1.DataUpload expectedPVC *corev1api.PersistentVolumeClaim resourcePolicy *corev1api.ConfigMap + extraObjects []runtime.Object failVSCreate bool skipVSReadyUpdate bool // New flag to control VS readiness }{ @@ -122,12 +123,15 @@ func TestExecute(t *testing.T) { expectErr: true, // Expect an error, but the exact message can vary }, { - name: "Test SnapshotMoveData", - backup: builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(), - pvc: builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(), - pv: builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(), - sc: builder.ForStorageClass("testSC").Provisioner("hostpath").Result(), - vsClass: builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(), + name: "Test SnapshotMoveData", + backup: builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(), + pvc: builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(), + pv: builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(), + sc: builder.ForStorageClass("testSC").Provisioner("hostpath").Result(), + vsClass: builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(), + extraObjects: []runtime.Object{ + builder.ForPod("velero", "node-agent-pod").Labels(map[string]string{"role": "node-agent"}).Phase(corev1api.PodRunning).NodeName("test-node").Result(), + }, operationID: ".", expectedDataUpload: &velerov2alpha1.DataUpload{ TypeMeta: metav1.TypeMeta{ @@ -167,12 +171,15 @@ func TestExecute(t *testing.T) { }, }, { - name: "Verify PVC is modified as expected", - backup: builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(), - pvc: builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(), - pv: builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(), - sc: builder.ForStorageClass("testSC").Provisioner("hostpath").Result(), - vsClass: builder.ForVolumeSnapshotClass("tescVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(), + name: "Verify PVC is modified as expected", + backup: builder.ForBackup("velero", 
"test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(), + pvc: builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(), + pv: builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(), + sc: builder.ForStorageClass("testSC").Provisioner("hostpath").Result(), + vsClass: builder.ForVolumeSnapshotClass("tescVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(), + extraObjects: []runtime.Object{ + builder.ForPod("velero", "node-agent-pod").Labels(map[string]string{"role": "node-agent"}).Phase(corev1api.PodRunning).NodeName("test-node").Result(), + }, operationID: ".", expectedPVC: builder.ForPersistentVolumeClaim("velero", "testPVC"). ObjectMeta(builder.WithAnnotations(velerov1api.MustIncludeAdditionalItemAnnotation, "true", velerov1api.DataUploadNameAnnotation, "velero/"), @@ -210,6 +217,7 @@ func TestExecute(t *testing.T) { if tc.resourcePolicy != nil { objects = append(objects, tc.resourcePolicy) } + objects = append(objects, tc.extraObjects...) var crClient crclient.Client if tc.failVSCreate { diff --git a/pkg/controller/backup_controller.go b/pkg/controller/backup_controller.go index 496308a6e0..51bbd9850b 100644 --- a/pkg/controller/backup_controller.go +++ b/pkg/controller/backup_controller.go @@ -49,10 +49,12 @@ import ( velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1" pkgbackup "github.com/vmware-tanzu/velero/pkg/backup" "github.com/vmware-tanzu/velero/pkg/constant" + "github.com/vmware-tanzu/velero/pkg/datamover" "github.com/vmware-tanzu/velero/pkg/discovery" "github.com/vmware-tanzu/velero/pkg/features" "github.com/vmware-tanzu/velero/pkg/label" "github.com/vmware-tanzu/velero/pkg/metrics" + "github.com/vmware-tanzu/velero/pkg/nodeagent" "github.com/vmware-tanzu/velero/pkg/persistence" "github.com/vmware-tanzu/velero/pkg/plugin/clientmgmt" "github.com/vmware-tanzu/velero/pkg/plugin/framework" @@ -494,6 +496,17 @@ func (b *backupReconciler) prepareBackupRequest(ctx context.Context, backup *vel } } + // validate that the node-agent daemonset is ready when snapshot data movement with + // the built-in data mover is requested. Without this, the DataUpload CR will be + // created but never processed (the DataUpload controller runs inside node-agent), + // causing the backup to hang until itemOperationTimeout expires. 
+	if boolptr.IsSetToTrue(request.Spec.SnapshotMoveData) && datamover.IsBuiltInUploader(request.Spec.DataMover) {
+		if err := nodeagent.IsReady(ctx, request.Namespace, b.kbClient); err != nil {
+			request.Status.ValidationErrors = append(request.Status.ValidationErrors,
+				fmt.Sprintf("node-agent is not ready: %v; the built-in data mover requires node-agent to be deployed and running", err))
+		}
+	}
+
 	// Getting all information of cluster version - useful for future skip-level migration
 	if request.Annotations == nil {
 		request.Annotations = make(map[string]string)
diff --git a/pkg/controller/backup_controller_test.go b/pkg/controller/backup_controller_test.go
index 3864989002..313a71b251 100644
--- a/pkg/controller/backup_controller_test.go
+++ b/pkg/controller/backup_controller_test.go
@@ -33,6 +33,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
+	appsv1api "k8s.io/api/apps/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
@@ -492,6 +493,98 @@ func TestDefaultBackupTTL(t *testing.T) {
 	}
 }
 
+func TestPrepareBackupRequest_NodeAgentValidation(t *testing.T) {
+	now, err := time.Parse(time.RFC1123Z, time.RFC1123Z)
+	require.NoError(t, err)
+
+	nodeAgentDS := &appsv1api.DaemonSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: velerov1api.DefaultNamespace,
+			Name:      "node-agent",
+		},
+		Status: appsv1api.DaemonSetStatus{NumberReady: 3},
+	}
+
+	nodeAgentDSNotReady := &appsv1api.DaemonSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: velerov1api.DefaultNamespace,
+			Name:      "node-agent",
+		},
+		Status: appsv1api.DaemonSetStatus{NumberReady: 0},
+	}
+
+	tests := []struct {
+		name            string
+		backup          *velerov1api.Backup
+		objs            []runtime.Object
+		expectNodeAgent bool
+	}{
+		{
+			name:            "snapshotMoveData with no node-agent daemonset",
+			backup:          defaultBackup().SnapshotMoveData(true).Result(),
+			expectNodeAgent: true,
+		},
+		{
+			name:            "snapshotMoveData with node-agent daemonset not ready",
+			backup:          defaultBackup().SnapshotMoveData(true).Result(),
+			objs:            []runtime.Object{nodeAgentDSNotReady},
+			expectNodeAgent: true,
+		},
+		{
+			name:   "snapshotMoveData with ready node-agent daemonset",
+			backup: defaultBackup().SnapshotMoveData(true).Result(),
+			objs:   []runtime.Object{nodeAgentDS},
+		},
+		{
+			name:   "snapshotMoveData with custom data mover skips check",
+			backup: defaultBackup().SnapshotMoveData(true).DataMover("custom-mover").Result(),
+		},
+		{
+			name:   "snapshotMoveData disabled skips check",
+			backup: defaultBackup().Result(),
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			formatFlag := logging.FormatText
+			logger := logging.DefaultLogger(logrus.DebugLevel, formatFlag)
+			apiServer := velerotest.NewAPIServer(t)
+			discoveryHelper, err := discovery.NewHelper(apiServer.DiscoveryClient, logger)
+			require.NoError(t, err)
+
+			fakeClient := velerotest.NewFakeControllerRuntimeClient(t, test.objs...)
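+			// Only the fields exercised by prepareBackupRequest need to be populated on the reconciler.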
+ + c := &backupReconciler{ + discoveryHelper: discoveryHelper, + kbClient: fakeClient, + defaultBackupTTL: 24 * 30 * time.Hour, + clock: testclocks.NewFakeClock(now), + formatFlag: formatFlag, + } + + res := c.prepareBackupRequest(ctx, test.backup, logger) + defer res.WorkerPool.Stop() + + if test.expectNodeAgent { + hasNodeAgentError := false + for _, e := range res.Status.ValidationErrors { + if strings.Contains(e, "node-agent") { + hasNodeAgentError = true + break + } + } + assert.True(t, hasNodeAgentError, "expected a node-agent validation error") + } else { + for _, e := range res.Status.ValidationErrors { + assert.NotContains(t, e, "node-agent") + } + } + }) + } +} + func TestPrepareBackupRequest_SetsVGSLabelKey(t *testing.T) { now, err := time.Parse(time.RFC1123Z, time.RFC1123Z) require.NoError(t, err) @@ -671,6 +763,17 @@ func TestProcessBackupCompletions(t *testing.T) { now = now.Local() timestamp := metav1.NewTime(now) + // Node-agent daemonset is needed for all test cases so that the node-agent + // validation in prepareBackupRequest does not reject backups that use + // snapshot data movement. + nodeAgentDS := &appsv1api.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: velerov1api.DefaultNamespace, + Name: "node-agent", + }, + Status: appsv1api.DaemonSetStatus{NumberReady: 3}, + } + tests := []struct { name string backup *velerov1api.Backup @@ -1515,6 +1618,7 @@ func TestProcessBackupCompletions(t *testing.T) { builder.ForVolumeSnapshotContent("testVSC").ObjectMeta(builder.WithLabels(velerov1api.BackupNameLabel, "backup-1")).VolumeSnapshotClassName("testClass").Status(&snapshotv1api.VolumeSnapshotContentStatus{ SnapshotHandle: &snapshotHandle, }).Result(), + nodeAgentDS, ) } else { fakeClient = velerotest.NewFakeControllerRuntimeClient(t, @@ -1522,6 +1626,7 @@ func TestProcessBackupCompletions(t *testing.T) { builder.ForVolumeSnapshotContent("testVSC").ObjectMeta(builder.WithLabels(velerov1api.BackupNameLabel, "backup-1")).VolumeSnapshotClassName("testClass").Status(&snapshotv1api.VolumeSnapshotContentStatus{ SnapshotHandle: &snapshotHandle, }).Result(), + nodeAgentDS, ) } diff --git a/pkg/nodeagent/node_agent.go b/pkg/nodeagent/node_agent.go index 87efb896a6..ed93926aec 100644 --- a/pkg/nodeagent/node_agent.go +++ b/pkg/nodeagent/node_agent.go @@ -22,6 +22,7 @@ import ( "fmt" "github.com/pkg/errors" + appsv1api "k8s.io/api/apps/v1" corev1api "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -80,6 +81,20 @@ func KbClientIsRunningInNode(ctx context.Context, namespace string, nodeName str return isRunningInNode(ctx, namespace, nodeName, nil, kubeClient) } +// IsReady checks whether the node-agent daemonset has at least one ready pod by inspecting the DaemonSet status. +func IsReady(ctx context.Context, namespace string, crClient ctrlclient.Client) error { + ds := new(appsv1api.DaemonSet) + if err := crClient.Get(ctx, ctrlclient.ObjectKey{Namespace: namespace, Name: daemonSet}, ds); err != nil { + return errors.Wrap(err, "failed to get node-agent daemonset") + } + + if ds.Status.NumberReady == 0 { + return errors.New("node-agent daemonset has no ready pods") + } + + return nil +} + // IsRunningInNode checks if the node agent pod is running properly in a specified node through controller client. 
If not, return the error found func IsRunningInNode(ctx context.Context, namespace string, nodeName string, crClient ctrlclient.Client) error { return isRunningInNode(ctx, namespace, nodeName, crClient, nil) diff --git a/pkg/nodeagent/node_agent_test.go b/pkg/nodeagent/node_agent_test.go index 168e91de10..ede4e9f85c 100644 --- a/pkg/nodeagent/node_agent_test.go +++ b/pkg/nodeagent/node_agent_test.go @@ -213,6 +213,64 @@ func TestIsRunningInNode(t *testing.T) { } } +func TestIsReady(t *testing.T) { + scheme := runtime.NewScheme() + appsv1api.AddToScheme(scheme) + + dsNotReady := &appsv1api.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Namespace: "fake-ns", Name: "node-agent"}, + Status: appsv1api.DaemonSetStatus{NumberReady: 0}, + } + dsReady := &appsv1api.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{Namespace: "fake-ns", Name: "node-agent"}, + Status: appsv1api.DaemonSetStatus{NumberReady: 3}, + } + + tests := []struct { + name string + kubeClientObj []runtime.Object + namespace string + expectErr string + }{ + { + name: "daemonset not found", + namespace: "fake-ns", + expectErr: "failed to get node-agent daemonset: daemonsets.apps \"node-agent\" not found", + }, + { + name: "daemonset exists but no ready pods", + namespace: "fake-ns", + kubeClientObj: []runtime.Object{ + dsNotReady, + }, + expectErr: "node-agent daemonset has no ready pods", + }, + { + name: "daemonset has ready pods", + namespace: "fake-ns", + kubeClientObj: []runtime.Object{ + dsReady, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + fakeClient := clientFake.NewClientBuilder(). + WithScheme(scheme). + WithRuntimeObjects(test.kubeClientObj...). + Build() + + err := IsReady(t.Context(), test.namespace, fakeClient) + if test.expectErr == "" { + assert.NoError(t, err) + } else { + assert.EqualError(t, err, test.expectErr) + } + }) + } +} + func TestGetPodSpec(t *testing.T) { podSpec := corev1api.PodSpec{ NodeName: "fake-node",