From ab904533387be689633fed13de88e015df42ab86 Mon Sep 17 00:00:00 2001
From: Joseph
Date: Thu, 9 Apr 2026 15:20:48 -0400
Subject: [PATCH 01/11] Add node-agent pod check to DataMover backup path

When SnapshotMoveData is enabled but no node-agent pods are running,
DataUpload CRs sit unprocessed until timeout. This adds a pre-flight
check in the PVC backup item action that verifies running node-agent
pods exist before creating DataUpload CRs, mirroring the check FSB
already performs. Cleans up the VolumeSnapshot on failure to prevent
orphaned resources.

Signed-off-by: Joseph
---
 pkg/backup/actions/csi/pvc_action.go      |  7 +++
 pkg/backup/actions/csi/pvc_action_test.go | 31 ++++++++---
 pkg/nodeagent/node_agent.go               | 25 +++++++++
 pkg/nodeagent/node_agent_test.go          | 66 +++++++++++++++++++++++
 4 files changed, 122 insertions(+), 7 deletions(-)

diff --git a/pkg/backup/actions/csi/pvc_action.go b/pkg/backup/actions/csi/pvc_action.go
index ac5f71a98e..98979f10c0 100644
--- a/pkg/backup/actions/csi/pvc_action.go
+++ b/pkg/backup/actions/csi/pvc_action.go
@@ -49,6 +49,7 @@ import (
 	veleroclient "github.com/vmware-tanzu/velero/pkg/client"
 	"github.com/vmware-tanzu/velero/pkg/kuberesource"
 	"github.com/vmware-tanzu/velero/pkg/label"
+	"github.com/vmware-tanzu/velero/pkg/nodeagent"
 	plugincommon "github.com/vmware-tanzu/velero/pkg/plugin/framework/common"
 	"github.com/vmware-tanzu/velero/pkg/plugin/utils/volumehelper"
 	"github.com/vmware-tanzu/velero/pkg/plugin/velero"
@@ -384,6 +385,12 @@ func (p *pvcBackupItemAction) Execute(

 	dataUploadLog.Info("Starting data upload of backup")

+	if err := nodeagent.HasRunningPods(context.Background(), backup.Namespace, p.crClient); err != nil {
+		dataUploadLog.WithError(err).Error("failed to check for running node-agent pods")
+		csi.CleanupVolumeSnapshot(vs, p.crClient, p.log)
+		return nil, nil, "", nil, errors.Wrap(err, "snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running")
+	}
+
 	dataUpload, err := createDataUpload(
 		context.Background(),
 		backup,
diff --git a/pkg/backup/actions/csi/pvc_action_test.go b/pkg/backup/actions/csi/pvc_action_test.go
index efcb0b0abf..10b586b3d2 100644
--- a/pkg/backup/actions/csi/pvc_action_test.go
+++ b/pkg/backup/actions/csi/pvc_action_test.go
@@ -94,6 +94,7 @@ func TestExecute(t *testing.T) {
 		expectedDataUpload *velerov2alpha1.DataUpload
 		expectedPVC        *corev1api.PersistentVolumeClaim
 		resourcePolicy     *corev1api.ConfigMap
+		extraObjects       []runtime.Object
 		failVSCreate       bool
 		skipVSReadyUpdate  bool // New flag to control VS readiness
 	}{
@@ -122,12 +123,24 @@ func TestExecute(t *testing.T) {
 			expectErr:         true, // Expect an error, but the exact message can vary
 		},
 		{
-			name:        "Test SnapshotMoveData",
+			name:        "Fail SnapshotMoveData when no node-agent pods",
 			backup:      builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
 			pvc:         builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
 			pv:          builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
 			sc:          builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
 			vsClass:     builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
+			expectedErr: errors.New("snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running: no running node-agent pods found"),
+		},
+		{
+			name:    "Test SnapshotMoveData",
+			backup:  builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
+			pvc:     builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
+			pv:      builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
+			sc:      builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
+			vsClass: builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
+			extraObjects: []runtime.Object{
+				builder.ForPod("velero", "node-agent-pod").Labels(map[string]string{"role": "node-agent"}).Phase(corev1api.PodRunning).NodeName("test-node").Result(),
+			},
 			operationID: ".",
 			expectedDataUpload: &velerov2alpha1.DataUpload{
 				TypeMeta: metav1.TypeMeta{
 					Kind:       "DataUpload",
 					APIVersion: velerov2alpha1.SchemeGroupVersion.String(),
 				},
 				ObjectMeta: metav1.ObjectMeta{
 					GenerateName: "test-",
 					Namespace:    "velero",
 					Labels: map[string]string{
 						velerov1api.BackupNameLabel:       "test",
 						velerov1api.BackupUIDLabel:        "",
 						velerov1api.PVCUIDLabel:           "",
 						velerov1api.AsyncOperationIDLabel: "du-.",
 					},
 					OwnerReferences: []metav1.OwnerReference{
 						{
 							APIVersion: "velero.io/v1",
 							Kind:       "Backup",
 							Name:       "test",
 							UID:        "",
 							Controller: &boolTrue,
 						},
 					},
 				},
 				Spec: velerov2alpha1.DataUploadSpec{
 					SnapshotType: velerov2alpha1.SnapshotTypeCSI,
 					CSISnapshot: &velerov2alpha1.CSISnapshotSpec{
 						VolumeSnapshot: "",
 						StorageClass:   "testSC",
 						SnapshotClass:  "testVSClass",
 					},
 					SourcePVC:        "testPVC",
 					SourceNamespace:  "velero",
 					OperationTimeout: metav1.Duration{Duration: 1 * time.Minute},
 				},
 			},
 		},
@@ -167,12 +180,15 @@ func TestExecute(t *testing.T) {
 			},
 		},
 		{
-			name:        "Verify PVC is modified as expected",
-			backup:      builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
-			pvc:         builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
-			pv:          builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
-			sc:          builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
-			vsClass:     builder.ForVolumeSnapshotClass("tescVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
+			name:         "Verify PVC is modified as expected",
+			backup:       builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
+			pvc:          builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
+			pv:           builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
+			sc:           builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
+			vsClass:      builder.ForVolumeSnapshotClass("tescVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
+			extraObjects: []runtime.Object{
+				builder.ForPod("velero", "node-agent-pod").Labels(map[string]string{"role": "node-agent"}).Phase(corev1api.PodRunning).NodeName("test-node").Result(),
+			},
 			operationID: ".",
 			expectedPVC: builder.ForPersistentVolumeClaim("velero", "testPVC").
 				ObjectMeta(builder.WithAnnotations(velerov1api.MustIncludeAdditionalItemAnnotation, "true", velerov1api.DataUploadNameAnnotation, "velero/"),
@@ -210,6 +226,7 @@ func TestExecute(t *testing.T) {
 			if tc.resourcePolicy != nil {
 				objects = append(objects, tc.resourcePolicy)
 			}
+			objects = append(objects, tc.extraObjects...)

 			var crClient crclient.Client
 			if tc.failVSCreate {
diff --git a/pkg/nodeagent/node_agent.go b/pkg/nodeagent/node_agent.go
index 87efb896a6..cc616ba61a 100644
--- a/pkg/nodeagent/node_agent.go
+++ b/pkg/nodeagent/node_agent.go
@@ -80,6 +80,31 @@ func KbClientIsRunningInNode(ctx context.Context, namespace string, nodeName str
 	return isRunningInNode(ctx, namespace, nodeName, nil, kubeClient)
 }

+// HasRunningPods checks if any node agent pod is running in the namespace through controller client. If not, return the error found.
+func HasRunningPods(ctx context.Context, namespace string, crClient ctrlclient.Client) error {
+	pods := new(corev1api.PodList)
+	parsedSelector, err := labels.Parse(fmt.Sprintf("role=%s", nodeAgentRole))
+	if err != nil {
+		return errors.Wrap(err, "fail to parse selector")
+	}
+
+	err = crClient.List(ctx, pods, &ctrlclient.ListOptions{
+		LabelSelector: parsedSelector,
+		Namespace:     namespace,
+	})
+	if err != nil {
+		return errors.Wrap(err, "failed to list node-agent pods")
+	}
+
+	for i := range pods.Items {
+		if kube.IsPodRunning(&pods.Items[i]) == nil {
+			return nil
+		}
+	}
+
+	return errors.New("no running node-agent pods found")
+}
+
 // IsRunningInNode checks if the node agent pod is running properly in a specified node through controller client. If not, return the error found
 func IsRunningInNode(ctx context.Context, namespace string, nodeName string, crClient ctrlclient.Client) error {
 	return isRunningInNode(ctx, namespace, nodeName, crClient, nil)
diff --git a/pkg/nodeagent/node_agent_test.go b/pkg/nodeagent/node_agent_test.go
index 168e91de10..49f7e7f175 100644
--- a/pkg/nodeagent/node_agent_test.go
+++ b/pkg/nodeagent/node_agent_test.go
@@ -213,6 +213,72 @@ func TestIsRunningInNode(t *testing.T) {
 	}
 }

+func TestHasRunningPods(t *testing.T) {
+	scheme := runtime.NewScheme()
+	corev1api.AddToScheme(scheme)
+
+	nonNodeAgentPod := builder.ForPod("fake-ns", "fake-pod").Result()
+	nodeAgentPodNotRunning := builder.ForPod("fake-ns", "fake-pod-pending").Labels(map[string]string{"role": "node-agent"}).Result()
+	nodeAgentPodRunning := builder.ForPod("fake-ns", "fake-pod-running").
+		Labels(map[string]string{"role": "node-agent"}).
+		Phase(corev1api.PodRunning).
+		NodeName("fake-node").
+		Result()
+
+	tests := []struct {
+		name          string
+		kubeClientObj []runtime.Object
+		namespace     string
+		expectErr     string
+	}{
+		{
+			name:      "no pods at all",
+			namespace: "fake-ns",
+			expectErr: "no running node-agent pods found",
+		},
+		{
+			name:      "only non-node-agent pods",
+			namespace: "fake-ns",
+			kubeClientObj: []runtime.Object{
+				nonNodeAgentPod,
+			},
+			expectErr: "no running node-agent pods found",
+		},
+		{
+			name:      "node-agent pods exist but none running",
+			namespace: "fake-ns",
+			kubeClientObj: []runtime.Object{
+				nodeAgentPodNotRunning,
+			},
+			expectErr: "no running node-agent pods found",
+		},
+		{
+			name:      "at least one running node-agent pod",
+			namespace: "fake-ns",
+			kubeClientObj: []runtime.Object{
+				nodeAgentPodNotRunning,
+				nodeAgentPodRunning,
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			fakeClientBuilder := clientFake.NewClientBuilder()
+			fakeClientBuilder = fakeClientBuilder.WithScheme(scheme)
+
+			fakeClient := fakeClientBuilder.WithRuntimeObjects(test.kubeClientObj...).Build()
+
+			err := HasRunningPods(t.Context(), test.namespace, fakeClient)
+			if test.expectErr == "" {
+				assert.NoError(t, err)
+			} else {
+				assert.EqualError(t, err, test.expectErr)
+			}
+		})
+	}
+}
+
 func TestGetPodSpec(t *testing.T) {
 	podSpec := corev1api.PodSpec{
 		NodeName: "fake-node",

From dca63e76cc0a889cd2372e7e0ae2a1189948fb86 Mon Sep 17 00:00:00 2001
From: Joseph
Date: Thu, 9 Apr 2026 15:24:21 -0400
Subject: [PATCH 02/11] Changelog

Signed-off-by: Joseph
---
 changelogs/unreleased/9697-Joeavaikath | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelogs/unreleased/9697-Joeavaikath

diff --git a/changelogs/unreleased/9697-Joeavaikath b/changelogs/unreleased/9697-Joeavaikath
new file mode 100644
index 0000000000..425c854190
--- /dev/null
+++ b/changelogs/unreleased/9697-Joeavaikath
@@ -0,0 +1 @@
+Add node-agent pod check to DataMover backup path

From d5e4acd917c4a5c6fde9be1fb452c96c32e4dd04 Mon Sep 17 00:00:00 2001
From: Joseph
Date: Fri, 10 Apr 2026 09:10:53 -0400
Subject: [PATCH 03/11] Scope node-agent check to built-in data mover only

Custom data movers operate independently of node-agent, so the
HasRunningPods check should only run when the built-in "velero" data
mover is in use. Adds a test case verifying that custom data movers
bypass the check.

Signed-off-by: Joseph
---
 pkg/backup/actions/csi/pvc_action.go      | 11 ++++--
 pkg/backup/actions/csi/pvc_action_test.go | 46 +++++++++++++++++++++++
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/pkg/backup/actions/csi/pvc_action.go b/pkg/backup/actions/csi/pvc_action.go
index 98979f10c0..fdc35213e8 100644
--- a/pkg/backup/actions/csi/pvc_action.go
+++ b/pkg/backup/actions/csi/pvc_action.go
@@ -47,6 +47,7 @@ import (
 	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
 	velerov2alpha1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v2alpha1"
 	veleroclient "github.com/vmware-tanzu/velero/pkg/client"
+	"github.com/vmware-tanzu/velero/pkg/datamover"
 	"github.com/vmware-tanzu/velero/pkg/kuberesource"
 	"github.com/vmware-tanzu/velero/pkg/label"
 	"github.com/vmware-tanzu/velero/pkg/nodeagent"
@@ -385,10 +386,12 @@ func (p *pvcBackupItemAction) Execute(

 	dataUploadLog.Info("Starting data upload of backup")

-	if err := nodeagent.HasRunningPods(context.Background(), backup.Namespace, p.crClient); err != nil {
-		dataUploadLog.WithError(err).Error("failed to check for running node-agent pods")
-		csi.CleanupVolumeSnapshot(vs, p.crClient, p.log)
-		return nil, nil, "", nil, errors.Wrap(err, "snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running")
+	if datamover.IsBuiltInUploader(backup.Spec.DataMover) {
+		if err := nodeagent.HasRunningPods(context.Background(), backup.Namespace, p.crClient); err != nil {
+			dataUploadLog.WithError(err).Error("cannot perform snapshot data movement without running node-agent pods")
+			csi.CleanupVolumeSnapshot(vs, p.crClient, p.log)
+			return nil, nil, "", nil, errors.Wrap(err, "snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running")
+		}
 	}

 	dataUpload, err := createDataUpload(
diff --git a/pkg/backup/actions/csi/pvc_action_test.go b/pkg/backup/actions/csi/pvc_action_test.go
index 10b586b3d2..cc9f8f68ba 100644
--- a/pkg/backup/actions/csi/pvc_action_test.go
+++ b/pkg/backup/actions/csi/pvc_action_test.go
@@ -131,6 +131,52 @@ func TestExecute(t *testing.T) {
 			vsClass:     builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
 			expectedErr: errors.New("snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running: no running node-agent pods found"),
 		},
+		{
+			name: "Skip node-agent check for custom data mover",
+			backup: builder.ForBackup("velero", "test").SnapshotMoveData(true).DataMover("custom-mover").CSISnapshotTimeout(1 * time.Minute).Result(),
+			pvc: builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
+			pv: builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
+			sc: builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
+			vsClass: builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
+			operationID: ".",
+			expectedDataUpload: &velerov2alpha1.DataUpload{
+				TypeMeta: metav1.TypeMeta{
+					Kind:       "DataUpload",
+					APIVersion: velerov2alpha1.SchemeGroupVersion.String(),
+				},
+				ObjectMeta: metav1.ObjectMeta{
+					GenerateName: "test-",
+					Namespace:    "velero",
+					Labels: map[string]string{
+						velerov1api.BackupNameLabel:       "test",
+						velerov1api.BackupUIDLabel:        "",
+						velerov1api.PVCUIDLabel:           "",
+						velerov1api.AsyncOperationIDLabel: "du-.",
+					},
+					OwnerReferences: []metav1.OwnerReference{
+						{
+							APIVersion: "velero.io/v1",
+							Kind:       "Backup",
+							Name:       "test",
+							UID:        "",
+							Controller: &boolTrue,
+						},
+					},
+				},
+				Spec: velerov2alpha1.DataUploadSpec{
+					SnapshotType: velerov2alpha1.SnapshotTypeCSI,
+					CSISnapshot: &velerov2alpha1.CSISnapshotSpec{
+						VolumeSnapshot: "",
+						StorageClass:   "testSC",
+						SnapshotClass:  "testVSClass",
+					},
+					SourcePVC:        "testPVC",
+					DataMover:        "custom-mover",
+					SourceNamespace:  "velero",
+					OperationTimeout: metav1.Duration{Duration: 1 * time.Minute},
+				},
+			},
+		},
 		{
 			name:    "Test SnapshotMoveData",
 			backup:  builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),

From 92a567e50c3d99374b0fc30db9baf5fa2bed8378 Mon Sep 17 00:00:00 2001
From: Joseph
Date: Fri, 10 Apr 2026 09:17:50 -0400
Subject: [PATCH 04/11] fmt fix

Signed-off-by: Joseph
---
 pkg/backup/actions/csi/pvc_action_test.go | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pkg/backup/actions/csi/pvc_action_test.go b/pkg/backup/actions/csi/pvc_action_test.go
index cc9f8f68ba..f2b1ee6315 100644
--- a/pkg/backup/actions/csi/pvc_action_test.go
+++ b/pkg/backup/actions/csi/pvc_action_test.go
@@ -132,12 +132,12 @@ func TestExecute(t *testing.T) {
 			expectedErr: errors.New("snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running: no running node-agent pods found"),
 		},
 		{
-			name: "Skip node-agent check for custom data mover",
-			backup: builder.ForBackup("velero", "test").SnapshotMoveData(true).DataMover("custom-mover").CSISnapshotTimeout(1 * time.Minute).Result(),
-			pvc: builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
-			pv: builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
-			sc: builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
-			vsClass: builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
+			name:    "Skip node-agent check for custom data mover",
+			backup:  builder.ForBackup("velero", "test").SnapshotMoveData(true).DataMover("custom-mover").CSISnapshotTimeout(1 * time.Minute).Result(),
+			pvc:     builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
+			pv:      builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
+			sc:      builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
+			vsClass: builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
 			operationID: ".",
 			expectedDataUpload: &velerov2alpha1.DataUpload{
 				TypeMeta: metav1.TypeMeta{

From fc0bbc997f044262a55d7f8eb1b094089e71ae6e Mon Sep 17 00:00:00 2001
From: Joseph
Date: Mon, 13 Apr 2026 10:35:34 -0400
Subject: [PATCH 05/11] Move check further up

Signed-off-by: Joseph
---
 pkg/backup/actions/csi/pvc_action.go      | 17 +++++++++--------
 pkg/backup/actions/csi/pvc_action_test.go | 15 ++++++++-------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/pkg/backup/actions/csi/pvc_action.go b/pkg/backup/actions/csi/pvc_action.go
index fdc35213e8..ce7528edd1 100644
--- a/pkg/backup/actions/csi/pvc_action.go
+++ b/pkg/backup/actions/csi/pvc_action.go
@@ -337,6 +337,15 @@ func (p *pvcBackupItemAction) Execute(
 		return nil, nil, "", nil, err
 	}

+	// Fast-fail before creating the snapshot if snapshot data movement is
+	// requested with the built-in data mover but no node-agent pods are running.
+	if boolptr.IsSetToTrue(backup.Spec.SnapshotMoveData) && datamover.IsBuiltInUploader(backup.Spec.DataMover) {
+		if err := nodeagent.HasRunningPods(context.Background(), backup.Namespace, p.crClient); err != nil {
+			p.log.WithError(err).Error("cannot perform snapshot data movement without running node-agent pods")
+			return nil, nil, "", nil, errors.Wrap(err, "snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running")
+		}
+	}
+
 	vs, err := p.getVolumeSnapshotReference(context.TODO(), pvc, backup)
 	if err != nil {
 		return nil, nil, "", nil, err
@@ -386,14 +395,6 @@ func (p *pvcBackupItemAction) Execute(

 	dataUploadLog.Info("Starting data upload of backup")

-	if datamover.IsBuiltInUploader(backup.Spec.DataMover) {
-		if err := nodeagent.HasRunningPods(context.Background(), backup.Namespace, p.crClient); err != nil {
-			dataUploadLog.WithError(err).Error("cannot perform snapshot data movement without running node-agent pods")
-			csi.CleanupVolumeSnapshot(vs, p.crClient, p.log)
-			return nil, nil, "", nil, errors.Wrap(err, "snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running")
-		}
-	}
-
 	dataUpload, err := createDataUpload(
 		context.Background(),
 		backup,
diff --git a/pkg/backup/actions/csi/pvc_action_test.go b/pkg/backup/actions/csi/pvc_action_test.go
index f2b1ee6315..1617075e4c 100644
--- a/pkg/backup/actions/csi/pvc_action_test.go
+++ b/pkg/backup/actions/csi/pvc_action_test.go
@@ -123,13 +123,14 @@ func TestExecute(t *testing.T) {
 			expectErr:         true, // Expect an error, but the exact message can vary
 		},
 		{
-			name:        "Fail SnapshotMoveData when no node-agent pods",
-			backup:      builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
-			pvc:         builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
-			pv:          builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
-			sc:          builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
-			vsClass:     builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
-			expectedErr: errors.New("snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running: no running node-agent pods found"),
+			name:              "Fail SnapshotMoveData when no node-agent pods",
+			backup:            builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
+			pvc:               builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
+			pv:                builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
+			sc:                builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
+			vsClass: builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
+			skipVSReadyUpdate: true,
+			expectedErr:       errors.New("snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running: no running node-agent pods found"),
 		},
 		{
 			name:    "Skip node-agent check for custom data mover",

From 8d10cebff82dbd3e98aecb44bf125d5b95d5fac1 Mon Sep 17 00:00:00 2001
From: Joseph
Date: Mon, 13 Apr 2026 10:52:59 -0400
Subject: [PATCH 06/11] Fmt

Signed-off-by: Joseph
---
 pkg/backup/actions/csi/pvc_action_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/backup/actions/csi/pvc_action_test.go b/pkg/backup/actions/csi/pvc_action_test.go
index 1617075e4c..4c2b74399a 100644
--- a/pkg/backup/actions/csi/pvc_action_test.go
+++ b/pkg/backup/actions/csi/pvc_action_test.go
@@ -128,7 +128,7 @@ func TestExecute(t *testing.T) {
 			pvc:               builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
 			pv:                builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
 			sc:                builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
-			vsClass: builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
+			vsClass:           builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
 			skipVSReadyUpdate: true,
 			expectedErr:       errors.New("snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running: no running node-agent pods found"),
 		},

From af7141cbb0f333a6375e4d3e9f2af36481e6b514 Mon Sep 17 00:00:00 2001
From: Joseph
Date: Mon, 13 Apr 2026 14:25:22 -0400
Subject: [PATCH 07/11] Move node-agent check to backup validation

The node-agent running check now lives in prepareBackupRequest()
alongside other pre-flight validations (BSL availability, snapshot
locations). This fails the entire backup with FailedValidation instead
of producing per-PVC errors, and avoids creating snapshots that would
need cleanup.

Signed-off-by: Joseph
---
 changelogs/unreleased/9697-Joeavaikath    |  2 +-
 pkg/backup/actions/csi/pvc_action.go      | 11 ----
 pkg/backup/actions/csi/pvc_action_test.go | 56 ------------------
 pkg/controller/backup_controller.go       | 13 +++++
 pkg/controller/backup_controller_test.go  | 69 +++++++++++++++++++++++
 5 files changed, 83 insertions(+), 68 deletions(-)

diff --git a/changelogs/unreleased/9697-Joeavaikath b/changelogs/unreleased/9697-Joeavaikath
index 425c854190..ad8e5eb2e2 100644
--- a/changelogs/unreleased/9697-Joeavaikath
+++ b/changelogs/unreleased/9697-Joeavaikath
@@ -1 +1 @@
-Add node-agent pod check to DataMover backup path
+Fail backup validation when built-in data mover is requested but no node-agent pods are running
diff --git a/pkg/backup/actions/csi/pvc_action.go b/pkg/backup/actions/csi/pvc_action.go
index ce7528edd1..ac5f71a98e 100644
--- a/pkg/backup/actions/csi/pvc_action.go
+++ b/pkg/backup/actions/csi/pvc_action.go
@@ -47,10 +47,8 @@ import (
 	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
 	velerov2alpha1 "github.com/vmware-tanzu/velero/pkg/apis/velero/v2alpha1"
 	veleroclient "github.com/vmware-tanzu/velero/pkg/client"
-	"github.com/vmware-tanzu/velero/pkg/datamover"
 	"github.com/vmware-tanzu/velero/pkg/kuberesource"
 	"github.com/vmware-tanzu/velero/pkg/label"
-	"github.com/vmware-tanzu/velero/pkg/nodeagent"
 	plugincommon "github.com/vmware-tanzu/velero/pkg/plugin/framework/common"
 	"github.com/vmware-tanzu/velero/pkg/plugin/utils/volumehelper"
 	"github.com/vmware-tanzu/velero/pkg/plugin/velero"
@@ -335,15 +335,6 @@ func (p *pvcBackupItemAction) Execute(
 		return nil, nil, "", nil, err
 	}

-	// Fast-fail before creating the snapshot if snapshot data movement is
-	// requested with the built-in data mover but no node-agent pods are running.
-	if boolptr.IsSetToTrue(backup.Spec.SnapshotMoveData) && datamover.IsBuiltInUploader(backup.Spec.DataMover) {
-		if err := nodeagent.HasRunningPods(context.Background(), backup.Namespace, p.crClient); err != nil {
-			p.log.WithError(err).Error("cannot perform snapshot data movement without running node-agent pods")
-			return nil, nil, "", nil, errors.Wrap(err, "snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running")
-		}
-	}
-
 	vs, err := p.getVolumeSnapshotReference(context.TODO(), pvc, backup)
 	if err != nil {
 		return nil, nil, "", nil, err
diff --git a/pkg/backup/actions/csi/pvc_action_test.go b/pkg/backup/actions/csi/pvc_action_test.go
index 4c2b74399a..0ae72976fe 100644
--- a/pkg/backup/actions/csi/pvc_action_test.go
+++ b/pkg/backup/actions/csi/pvc_action_test.go
@@ -122,62 +122,6 @@ func TestExecute(t *testing.T) {
 			skipVSReadyUpdate: true, // This will cause the timeout
 			expectErr:         true, // Expect an error, but the exact message can vary
 		},
-		{
-			name:              "Fail SnapshotMoveData when no node-agent pods",
-			backup:            builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
-			pvc:               builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
-			pv:                builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
-			sc:                builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
-			vsClass:           builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
-			skipVSReadyUpdate: true,
-			expectedErr:       errors.New("snapshot data movement requires a running node-agent daemonset; ensure node-agent is deployed and running: no running node-agent pods found"),
-		},
-		{
-			name:    "Skip node-agent check for custom data mover",
-			backup:  builder.ForBackup("velero", "test").SnapshotMoveData(true).DataMover("custom-mover").CSISnapshotTimeout(1 * time.Minute).Result(),
-			pvc:     builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
-			pv:      builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
-			sc:      builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
-			vsClass: builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
-			operationID: ".",
-			expectedDataUpload: &velerov2alpha1.DataUpload{
-				TypeMeta: metav1.TypeMeta{
-					Kind:       "DataUpload",
-					APIVersion: velerov2alpha1.SchemeGroupVersion.String(),
-				},
-				ObjectMeta: metav1.ObjectMeta{
-					GenerateName: "test-",
-					Namespace:    "velero",
-					Labels: map[string]string{
-						velerov1api.BackupNameLabel:       "test",
-						velerov1api.BackupUIDLabel:        "",
-						velerov1api.PVCUIDLabel:           "",
-						velerov1api.AsyncOperationIDLabel: "du-.",
-					},
-					OwnerReferences: []metav1.OwnerReference{
-						{
-							APIVersion: "velero.io/v1",
-							Kind:       "Backup",
-							Name:       "test",
-							UID:        "",
-							Controller: &boolTrue,
-						},
-					},
-				},
-				Spec: velerov2alpha1.DataUploadSpec{
-					SnapshotType: velerov2alpha1.SnapshotTypeCSI,
-					CSISnapshot: &velerov2alpha1.CSISnapshotSpec{
-						VolumeSnapshot: "",
-						StorageClass:   "testSC",
-						SnapshotClass:  "testVSClass",
-					},
-					SourcePVC:        "testPVC",
-					DataMover:        "custom-mover",
-					SourceNamespace:  "velero",
-					OperationTimeout: metav1.Duration{Duration: 1 * time.Minute},
-				},
-			},
-		},
 		{
 			name:    "Test SnapshotMoveData",
 			backup:  builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
diff --git a/pkg/controller/backup_controller.go b/pkg/controller/backup_controller.go
index 496308a6e0..f18bd77423 100644
--- a/pkg/controller/backup_controller.go
+++ b/pkg/controller/backup_controller.go
@@ -49,10 +49,12 @@ import (
 	velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
 	pkgbackup "github.com/vmware-tanzu/velero/pkg/backup"
 	"github.com/vmware-tanzu/velero/pkg/constant"
+	"github.com/vmware-tanzu/velero/pkg/datamover"
 	"github.com/vmware-tanzu/velero/pkg/discovery"
 	"github.com/vmware-tanzu/velero/pkg/features"
 	"github.com/vmware-tanzu/velero/pkg/label"
 	"github.com/vmware-tanzu/velero/pkg/metrics"
+	"github.com/vmware-tanzu/velero/pkg/nodeagent"
 	"github.com/vmware-tanzu/velero/pkg/persistence"
 	"github.com/vmware-tanzu/velero/pkg/plugin/clientmgmt"
 	"github.com/vmware-tanzu/velero/pkg/plugin/framework"
@@ -494,6 +496,17 @@ func (b *backupReconciler) prepareBackupRequest(ctx context.Context, backup *vel
 		}
 	}

+	// validate that node-agent pods are running when snapshot data movement with the
+	// built-in data mover is requested. Without this, the DataUpload CR will be created
+	// but never processed (the DataUpload controller runs inside node-agent), causing the
+	// backup to hang until itemOperationTimeout expires.
+	if boolptr.IsSetToTrue(request.Spec.SnapshotMoveData) && datamover.IsBuiltInUploader(request.Spec.DataMover) {
+		if err := nodeagent.HasRunningPods(ctx, request.Namespace, b.kbClient); err != nil {
+			request.Status.ValidationErrors = append(request.Status.ValidationErrors,
+				"no running node-agent pods found; the built-in data mover requires node-agent to be deployed")
+		}
+	}
+
 	// Getting all information of cluster version - useful for future skip-level migration
 	if request.Annotations == nil {
 		request.Annotations = make(map[string]string)
diff --git a/pkg/controller/backup_controller_test.go b/pkg/controller/backup_controller_test.go
index 3864989002..66d5c2aab3 100644
--- a/pkg/controller/backup_controller_test.go
+++ b/pkg/controller/backup_controller_test.go
@@ -33,6 +33,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
+	corev1api "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
@@ -492,6 +493,74 @@ func TestDefaultBackupTTL(t *testing.T) {
 	}
 }

+func TestPrepareBackupRequest_NodeAgentValidation(t *testing.T) {
+	now, err := time.Parse(time.RFC1123Z, time.RFC1123Z)
+	require.NoError(t, err)
+
+	nodeAgentPod := builder.ForPod(velerov1api.DefaultNamespace, "node-agent-abc").
+		Labels(map[string]string{"role": "node-agent"}).
+		NodeName("worker-1").
+		Phase(corev1api.PodRunning).
+		Result()
+
+	tests := []struct {
+		name                    string
+		backup                  *velerov1api.Backup
+		objs                    []runtime.Object
+		expectedValidationError string
+	}{
+		{
+			name:                    "snapshotMoveData with no node-agent pods",
+			backup:                  defaultBackup().SnapshotMoveData(true).Result(),
+			expectedValidationError: "no running node-agent pods found; the built-in data mover requires node-agent to be deployed",
+		},
+		{
+			name:   "snapshotMoveData with running node-agent pod",
+			backup: defaultBackup().SnapshotMoveData(true).Result(),
+			objs:   []runtime.Object{nodeAgentPod},
+		},
+		{
+			name:   "snapshotMoveData with custom data mover skips check",
+			backup: defaultBackup().SnapshotMoveData(true).DataMover("custom-mover").Result(),
+		},
+		{
+			name:   "snapshotMoveData disabled skips check",
+			backup: defaultBackup().Result(),
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			formatFlag := logging.FormatText
+			logger := logging.DefaultLogger(logrus.DebugLevel, formatFlag)
+			apiServer := velerotest.NewAPIServer(t)
+			discoveryHelper, err := discovery.NewHelper(apiServer.DiscoveryClient, logger)
+			require.NoError(t, err)
+
+			fakeClient := velerotest.NewFakeControllerRuntimeClient(t, test.objs...)
+
+			c := &backupReconciler{
+				discoveryHelper:  discoveryHelper,
+				kbClient:         fakeClient,
+				defaultBackupTTL: 24 * 30 * time.Hour,
+				clock:            testclocks.NewFakeClock(now),
+				formatFlag:       formatFlag,
+			}
+
+			res := c.prepareBackupRequest(ctx, test.backup, logger)
+			defer res.WorkerPool.Stop()
+
+			if test.expectedValidationError != "" {
+				assert.Contains(t, res.Status.ValidationErrors, test.expectedValidationError)
+			} else {
+				for _, e := range res.Status.ValidationErrors {
+					assert.NotContains(t, e, "node-agent")
+				}
+			}
+		})
+	}
+}
+
 func TestPrepareBackupRequest_SetsVGSLabelKey(t *testing.T) {
 	now, err := time.Parse(time.RFC1123Z, time.RFC1123Z)
 	require.NoError(t, err)

From 0a164d2cc5429be490650315fc1204a85526eaf5 Mon Sep 17 00:00:00 2001
From: Joseph
Date: Mon, 13 Apr 2026 16:55:03 -0400
Subject: [PATCH 08/11] Add node-agent pod to TestProcessBackupCompletions fake
 client

The node-agent validation added in prepareBackupRequest causes test
cases with SnapshotMoveData enabled to fail validation. Add a running
node-agent pod to the baseline test environment so all cases pass the
check.

Signed-off-by: Joseph
---
 pkg/controller/backup_controller_test.go | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pkg/controller/backup_controller_test.go b/pkg/controller/backup_controller_test.go
index 66d5c2aab3..70bc5e4eb1 100644
--- a/pkg/controller/backup_controller_test.go
+++ b/pkg/controller/backup_controller_test.go
@@ -740,6 +740,15 @@ func TestProcessBackupCompletions(t *testing.T) {
 	now = now.Local()
 	timestamp := metav1.NewTime(now)

+	// Node-agent pod is needed for all test cases so that the node-agent
+	// validation in prepareBackupRequest does not reject backups that use
+	// snapshot data movement.
+	nodeAgentPod := builder.ForPod(velerov1api.DefaultNamespace, "node-agent-abc").
+		Labels(map[string]string{"role": "node-agent"}).
+		NodeName("worker-1").
+		Phase(corev1api.PodRunning).
+		Result()
+
 	tests := []struct {
 		name   string
 		backup *velerov1api.Backup
@@ -1584,6 +1593,7 @@ func TestProcessBackupCompletions(t *testing.T) {
 				builder.ForVolumeSnapshotContent("testVSC").ObjectMeta(builder.WithLabels(velerov1api.BackupNameLabel, "backup-1")).VolumeSnapshotClassName("testClass").Status(&snapshotv1api.VolumeSnapshotContentStatus{
 					SnapshotHandle: &snapshotHandle,
 				}).Result(),
+				nodeAgentPod,
 			)
 		} else {
 			fakeClient = velerotest.NewFakeControllerRuntimeClient(t,
 				builder.ForVolumeSnapshotContent("testVSC").ObjectMeta(builder.WithLabels(velerov1api.BackupNameLabel, "backup-1")).VolumeSnapshotClassName("testClass").Status(&snapshotv1api.VolumeSnapshotContentStatus{
 					SnapshotHandle: &snapshotHandle,
 				}).Result(),
+				nodeAgentPod,
 			)
 		}

From c7b65cf2d2ee38f9ce6d3bdd3601899fb4264dcd Mon Sep 17 00:00:00 2001
From: Joseph
Date: Mon, 20 Apr 2026 13:28:11 -0400
Subject: [PATCH 09/11] Fix filename

Signed-off-by: Joseph
---
 changelogs/unreleased/9683-Lyndon-Li | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelogs/unreleased/9683-Lyndon-Li

diff --git a/changelogs/unreleased/9683-Lyndon-Li b/changelogs/unreleased/9683-Lyndon-Li
new file mode 100644
index 0000000000..35ae4d5cb2
--- /dev/null
+++ b/changelogs/unreleased/9683-Lyndon-Li
@@ -0,0 +1 @@
+Fix issue #9659, in the case that PVB/PVR/DU/DD is cancelled before the data path is really started, call EndEvent to prevent data mover pod from crashing because of delay event distribution
\ No newline at end of file

From 8d559def9e002b1ea44f56a91708c26c8cdccc25 Mon Sep 17 00:00:00 2001
From: Joseph
Date: Mon, 20 Apr 2026 13:28:37 -0400
Subject: [PATCH 10/11] Remove file

Signed-off-by: Joseph
---
 "changelogs/unreleased/9663-Lyndon-Li\342\200\216\342\200\216" | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 "changelogs/unreleased/9663-Lyndon-Li\342\200\216\342\200\216"

diff --git "a/changelogs/unreleased/9663-Lyndon-Li\342\200\216\342\200\216" "b/changelogs/unreleased/9663-Lyndon-Li\342\200\216\342\200\216"
deleted file mode 100644
index 35ae4d5cb2..0000000000
--- "a/changelogs/unreleased/9663-Lyndon-Li\342\200\216\342\200\216"
+++ /dev/null
@@ -1 +0,0 @@
-Fix issue #9659, in the case that PVB/PVR/DU/DD is cancelled before the data path is really started, call EndEvent to prevent data mover pod from crashing because of delay event distribution
\ No newline at end of file

From fbe39987c83134468fd42813527ab56588991efe Mon Sep 17 00:00:00 2001
From: Joseph
Date: Tue, 5 May 2026 19:41:37 -0400
Subject: [PATCH 11/11] Check node-agent DaemonSet status instead of listing
 pods

Replace HasRunningPods (which listed all node-agent pods and iterated
to find a running one) with IsReady, which does a single Get on the
node-agent DaemonSet and checks status.numberReady > 0. This is a
cheaper check and more semantically appropriate since we only need to
know the DaemonSet is healthy, not inspect individual pods.
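
A minimal sketch of the readiness rule this change adopts (assumed
semantics, mirroring the IsReady implementation in the diff below):
the DaemonSet counts as usable as soon as status.numberReady > 0.
Note this is deliberately weaker than requiring readiness on every
node; one ready node-agent pod is enough for DataUploads to be picked
up.

    package main

    import (
        "errors"
        "fmt"
    )

    // daemonSetStatus holds the one field the check relies on; in the
    // real code this is appsv1.DaemonSetStatus fetched with a single Get.
    type daemonSetStatus struct {
        NumberReady int32
    }

    // isReady mirrors the assumed rule: any non-zero NumberReady passes,
    // regardless of how many nodes the DaemonSet is scheduled on.
    func isReady(s daemonSetStatus) error {
        if s.NumberReady == 0 {
            return errors.New("node-agent daemonset has no ready pods")
        }
        return nil
    }

    func main() {
        fmt.Println(isReady(daemonSetStatus{NumberReady: 0})) // error
        fmt.Println(isReady(daemonSetStatus{NumberReady: 1})) // <nil>
    }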

Signed-off-by: Joseph
---
 pkg/controller/backup_controller.go      | 12 ++--
 pkg/controller/backup_controller_test.go | 75 ++++++++++++++++--------
 pkg/nodeagent/node_agent.go              | 28 +++------
 pkg/nodeagent/node_agent_test.go         | 52 +++++++---------
 4 files changed, 87 insertions(+), 80 deletions(-)

diff --git a/pkg/controller/backup_controller.go b/pkg/controller/backup_controller.go
index f18bd77423..51bbd9850b 100644
--- a/pkg/controller/backup_controller.go
+++ b/pkg/controller/backup_controller.go
@@ -496,14 +496,14 @@ func (b *backupReconciler) prepareBackupRequest(ctx context.Context, backup *vel
 		}
 	}

-	// validate that node-agent pods are running when snapshot data movement with the
-	// built-in data mover is requested. Without this, the DataUpload CR will be created
-	// but never processed (the DataUpload controller runs inside node-agent), causing the
-	// backup to hang until itemOperationTimeout expires.
+	// validate that the node-agent daemonset is ready when snapshot data movement with
+	// the built-in data mover is requested. Without this, the DataUpload CR will be
+	// created but never processed (the DataUpload controller runs inside node-agent),
+	// causing the backup to hang until itemOperationTimeout expires.
 	if boolptr.IsSetToTrue(request.Spec.SnapshotMoveData) && datamover.IsBuiltInUploader(request.Spec.DataMover) {
-		if err := nodeagent.HasRunningPods(ctx, request.Namespace, b.kbClient); err != nil {
+		if err := nodeagent.IsReady(ctx, request.Namespace, b.kbClient); err != nil {
 			request.Status.ValidationErrors = append(request.Status.ValidationErrors,
-				"no running node-agent pods found; the built-in data mover requires node-agent to be deployed")
+				fmt.Sprintf("node-agent is not ready: %v; the built-in data mover requires node-agent to be deployed and running", err))
 		}
 	}
diff --git a/pkg/controller/backup_controller_test.go b/pkg/controller/backup_controller_test.go
index 70bc5e4eb1..313a71b251 100644
--- a/pkg/controller/backup_controller_test.go
+++ b/pkg/controller/backup_controller_test.go
@@ -33,7 +33,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
-	corev1api "k8s.io/api/core/v1"
+	appsv1api "k8s.io/api/apps/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
@@ -497,27 +497,43 @@ func TestPrepareBackupRequest_NodeAgentValidation(t *testing.T) {
 	now, err := time.Parse(time.RFC1123Z, time.RFC1123Z)
 	require.NoError(t, err)

-	nodeAgentPod := builder.ForPod(velerov1api.DefaultNamespace, "node-agent-abc").
-		Labels(map[string]string{"role": "node-agent"}).
-		NodeName("worker-1").
-		Phase(corev1api.PodRunning).
-		Result()
+	nodeAgentDS := &appsv1api.DaemonSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: velerov1api.DefaultNamespace,
+			Name:      "node-agent",
+		},
+		Status: appsv1api.DaemonSetStatus{NumberReady: 3},
+	}
+
+	nodeAgentDSNotReady := &appsv1api.DaemonSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: velerov1api.DefaultNamespace,
+			Name:      "node-agent",
+		},
+		Status: appsv1api.DaemonSetStatus{NumberReady: 0},
+	}

 	tests := []struct {
-		name                    string
-		backup                  *velerov1api.Backup
-		objs                    []runtime.Object
-		expectedValidationError string
+		name            string
+		backup          *velerov1api.Backup
+		objs            []runtime.Object
+		expectNodeAgent bool
 	}{
 		{
-			name:                    "snapshotMoveData with no node-agent pods",
-			backup:                  defaultBackup().SnapshotMoveData(true).Result(),
-			expectedValidationError: "no running node-agent pods found; the built-in data mover requires node-agent to be deployed",
+			name:            "snapshotMoveData with no node-agent daemonset",
+			backup:          defaultBackup().SnapshotMoveData(true).Result(),
+			expectNodeAgent: true,
 		},
 		{
-			name:   "snapshotMoveData with running node-agent pod",
+			name:            "snapshotMoveData with node-agent daemonset not ready",
+			backup:          defaultBackup().SnapshotMoveData(true).Result(),
+			objs:            []runtime.Object{nodeAgentDSNotReady},
+			expectNodeAgent: true,
+		},
+		{
+			name:   "snapshotMoveData with ready node-agent daemonset",
 			backup: defaultBackup().SnapshotMoveData(true).Result(),
-			objs:   []runtime.Object{nodeAgentPod},
+			objs:   []runtime.Object{nodeAgentDS},
 		},
 		{
 			name:   "snapshotMoveData with custom data mover skips check",
@@ -550,8 +566,15 @@ func TestPrepareBackupRequest_NodeAgentValidation(t *testing.T) {
 			res := c.prepareBackupRequest(ctx, test.backup, logger)
 			defer res.WorkerPool.Stop()

-			if test.expectedValidationError != "" {
-				assert.Contains(t, res.Status.ValidationErrors, test.expectedValidationError)
+			if test.expectNodeAgent {
+				hasNodeAgentError := false
+				for _, e := range res.Status.ValidationErrors {
+					if strings.Contains(e, "node-agent") {
+						hasNodeAgentError = true
+						break
+					}
+				}
+				assert.True(t, hasNodeAgentError, "expected a node-agent validation error")
 			} else {
 				for _, e := range res.Status.ValidationErrors {
 					assert.NotContains(t, e, "node-agent")
@@ -740,14 +763,16 @@ func TestProcessBackupCompletions(t *testing.T) {
 	now = now.Local()
 	timestamp := metav1.NewTime(now)

-	// Node-agent pod is needed for all test cases so that the node-agent
+	// Node-agent daemonset is needed for all test cases so that the node-agent
 	// validation in prepareBackupRequest does not reject backups that use
 	// snapshot data movement.
-	nodeAgentPod := builder.ForPod(velerov1api.DefaultNamespace, "node-agent-abc").
-		Labels(map[string]string{"role": "node-agent"}).
-		NodeName("worker-1").
-		Phase(corev1api.PodRunning).
-		Result()
+	nodeAgentDS := &appsv1api.DaemonSet{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: velerov1api.DefaultNamespace,
+			Name:      "node-agent",
+		},
+		Status: appsv1api.DaemonSetStatus{NumberReady: 3},
+	}

 	tests := []struct {
 		name   string
@@ -1593,7 +1618,7 @@ func TestProcessBackupCompletions(t *testing.T) {
 				builder.ForVolumeSnapshotContent("testVSC").ObjectMeta(builder.WithLabels(velerov1api.BackupNameLabel, "backup-1")).VolumeSnapshotClassName("testClass").Status(&snapshotv1api.VolumeSnapshotContentStatus{
 					SnapshotHandle: &snapshotHandle,
 				}).Result(),
-				nodeAgentPod,
+				nodeAgentDS,
 			)
 		} else {
 			fakeClient = velerotest.NewFakeControllerRuntimeClient(t,
 				builder.ForVolumeSnapshotContent("testVSC").ObjectMeta(builder.WithLabels(velerov1api.BackupNameLabel, "backup-1")).VolumeSnapshotClassName("testClass").Status(&snapshotv1api.VolumeSnapshotContentStatus{
 					SnapshotHandle: &snapshotHandle,
 				}).Result(),
-				nodeAgentPod,
+				nodeAgentDS,
 			)
 		}
diff --git a/pkg/nodeagent/node_agent.go b/pkg/nodeagent/node_agent.go
index cc616ba61a..ed93926aec 100644
--- a/pkg/nodeagent/node_agent.go
+++ b/pkg/nodeagent/node_agent.go
@@ -22,6 +22,7 @@ import (
 	"fmt"

 	"github.com/pkg/errors"
+	appsv1api "k8s.io/api/apps/v1"
 	corev1api "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -80,29 +81,18 @@ func KbClientIsRunningInNode(ctx context.Context, namespace string, nodeName str
 	return isRunningInNode(ctx, namespace, nodeName, nil, kubeClient)
 }

-// HasRunningPods checks if any node agent pod is running in the namespace through controller client. If not, return the error found.
-func HasRunningPods(ctx context.Context, namespace string, crClient ctrlclient.Client) error {
-	pods := new(corev1api.PodList)
-	parsedSelector, err := labels.Parse(fmt.Sprintf("role=%s", nodeAgentRole))
-	if err != nil {
-		return errors.Wrap(err, "fail to parse selector")
+// IsReady checks whether the node-agent daemonset has at least one ready pod by inspecting the DaemonSet status.
+func IsReady(ctx context.Context, namespace string, crClient ctrlclient.Client) error {
+	ds := new(appsv1api.DaemonSet)
+	if err := crClient.Get(ctx, ctrlclient.ObjectKey{Namespace: namespace, Name: daemonSet}, ds); err != nil {
+		return errors.Wrap(err, "failed to get node-agent daemonset")
 	}

-	err = crClient.List(ctx, pods, &ctrlclient.ListOptions{
-		LabelSelector: parsedSelector,
-		Namespace:     namespace,
-	})
-	if err != nil {
-		return errors.Wrap(err, "failed to list node-agent pods")
-	}
-
-	for i := range pods.Items {
-		if kube.IsPodRunning(&pods.Items[i]) == nil {
-			return nil
-		}
+	if ds.Status.NumberReady == 0 {
+		return errors.New("node-agent daemonset has no ready pods")
 	}

-	return errors.New("no running node-agent pods found")
+	return nil
 }

 // IsRunningInNode checks if the node agent pod is running properly in a specified node through controller client. If not, return the error found
diff --git a/pkg/nodeagent/node_agent_test.go b/pkg/nodeagent/node_agent_test.go
index 49f7e7f175..ede4e9f85c 100644
--- a/pkg/nodeagent/node_agent_test.go
+++ b/pkg/nodeagent/node_agent_test.go
@@ -213,17 +213,18 @@ func TestIsRunningInNode(t *testing.T) {
 	}
 }

-func TestHasRunningPods(t *testing.T) {
+func TestIsReady(t *testing.T) {
 	scheme := runtime.NewScheme()
-	corev1api.AddToScheme(scheme)
+	appsv1api.AddToScheme(scheme)

-	nonNodeAgentPod := builder.ForPod("fake-ns", "fake-pod").Result()
-	nodeAgentPodNotRunning := builder.ForPod("fake-ns", "fake-pod-pending").Labels(map[string]string{"role": "node-agent"}).Result()
-	nodeAgentPodRunning := builder.ForPod("fake-ns", "fake-pod-running").
-		Labels(map[string]string{"role": "node-agent"}).
-		Phase(corev1api.PodRunning).
-		NodeName("fake-node").
-		Result()
+	dsNotReady := &appsv1api.DaemonSet{
+		ObjectMeta: metav1.ObjectMeta{Namespace: "fake-ns", Name: "node-agent"},
+		Status:     appsv1api.DaemonSetStatus{NumberReady: 0},
+	}
+	dsReady := &appsv1api.DaemonSet{
+		ObjectMeta: metav1.ObjectMeta{Namespace: "fake-ns", Name: "node-agent"},
+		Status:     appsv1api.DaemonSetStatus{NumberReady: 3},
+	}

 	tests := []struct {
 		name          string
@@ -232,44 +233,35 @@ func TestIsReady(t *testing.T) {
 		kubeClientObj []runtime.Object
 		namespace     string
 		expectErr     string
 	}{
 		{
-			name:      "no pods at all",
-			namespace: "fake-ns",
-			expectErr: "no running node-agent pods found",
-		},
-		{
-			name:      "only non-node-agent pods",
+			name:      "daemonset not found",
 			namespace: "fake-ns",
-			kubeClientObj: []runtime.Object{
-				nonNodeAgentPod,
-			},
-			expectErr: "no running node-agent pods found",
+			expectErr: "failed to get node-agent daemonset: daemonsets.apps \"node-agent\" not found",
 		},
 		{
-			name:      "node-agent pods exist but none running",
+			name:      "daemonset exists but no ready pods",
 			namespace: "fake-ns",
 			kubeClientObj: []runtime.Object{
-				nodeAgentPodNotRunning,
+				dsNotReady,
 			},
-			expectErr: "no running node-agent pods found",
+			expectErr: "node-agent daemonset has no ready pods",
 		},
 		{
-			name:      "at least one running node-agent pod",
+			name:      "daemonset has ready pods",
 			namespace: "fake-ns",
 			kubeClientObj: []runtime.Object{
-				nodeAgentPodNotRunning,
-				nodeAgentPodRunning,
+				dsReady,
 			},
 		},
 	}

 	for _, test := range tests {
 		t.Run(test.name, func(t *testing.T) {
-			fakeClientBuilder := clientFake.NewClientBuilder()
-			fakeClientBuilder = fakeClientBuilder.WithScheme(scheme)
-
-			fakeClient := fakeClientBuilder.WithRuntimeObjects(test.kubeClientObj...).Build()
+			fakeClient := clientFake.NewClientBuilder().
+				WithScheme(scheme).
+				WithRuntimeObjects(test.kubeClientObj...).
+				Build()

-			err := HasRunningPods(t.Context(), test.namespace, fakeClient)
+			err := IsReady(t.Context(), test.namespace, fakeClient)
 			if test.expectErr == "" {
 				assert.NoError(t, err)
 			} else {