Skip to content
1 change: 1 addition & 0 deletions changelogs/unreleased/9697-Joeavaikath
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fail backup validation when built-in data mover is requested but no node-agent pods are running
32 changes: 20 additions & 12 deletions pkg/backup/actions/csi/pvc_action_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ func TestExecute(t *testing.T) {
expectedDataUpload *velerov2alpha1.DataUpload
expectedPVC *corev1api.PersistentVolumeClaim
resourcePolicy *corev1api.ConfigMap
extraObjects []runtime.Object
failVSCreate bool
skipVSReadyUpdate bool // New flag to control VS readiness
}{
Expand Down Expand Up @@ -122,12 +123,15 @@ func TestExecute(t *testing.T) {
expectErr: true, // Expect an error, but the exact message can vary
},
{
name: "Test SnapshotMoveData",
backup: builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
pvc: builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
pv: builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
sc: builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
vsClass: builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
name: "Test SnapshotMoveData",
backup: builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
pvc: builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
pv: builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
sc: builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
vsClass: builder.ForVolumeSnapshotClass("testVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
extraObjects: []runtime.Object{
builder.ForPod("velero", "node-agent-pod").Labels(map[string]string{"role": "node-agent"}).Phase(corev1api.PodRunning).NodeName("test-node").Result(),
},
operationID: ".",
expectedDataUpload: &velerov2alpha1.DataUpload{
TypeMeta: metav1.TypeMeta{
Expand Down Expand Up @@ -167,12 +171,15 @@ func TestExecute(t *testing.T) {
},
},
{
name: "Verify PVC is modified as expected",
backup: builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
pvc: builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
pv: builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
sc: builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
vsClass: builder.ForVolumeSnapshotClass("tescVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
name: "Verify PVC is modified as expected",
backup: builder.ForBackup("velero", "test").SnapshotMoveData(true).CSISnapshotTimeout(1 * time.Minute).Result(),
pvc: builder.ForPersistentVolumeClaim("velero", "testPVC").VolumeName("testPV").StorageClass("testSC").Phase(corev1api.ClaimBound).Result(),
pv: builder.ForPersistentVolume("testPV").CSI("hostpath", "testVolume").Result(),
sc: builder.ForStorageClass("testSC").Provisioner("hostpath").Result(),
vsClass: builder.ForVolumeSnapshotClass("tescVSClass").Driver("hostpath").ObjectMeta(builder.WithLabels(velerov1api.VolumeSnapshotClassSelectorLabel, "")).Result(),
extraObjects: []runtime.Object{
builder.ForPod("velero", "node-agent-pod").Labels(map[string]string{"role": "node-agent"}).Phase(corev1api.PodRunning).NodeName("test-node").Result(),
},
operationID: ".",
expectedPVC: builder.ForPersistentVolumeClaim("velero", "testPVC").
ObjectMeta(builder.WithAnnotations(velerov1api.MustIncludeAdditionalItemAnnotation, "true", velerov1api.DataUploadNameAnnotation, "velero/"),
Expand Down Expand Up @@ -210,6 +217,7 @@ func TestExecute(t *testing.T) {
if tc.resourcePolicy != nil {
objects = append(objects, tc.resourcePolicy)
}
objects = append(objects, tc.extraObjects...)

var crClient crclient.Client
if tc.failVSCreate {
Expand Down
13 changes: 13 additions & 0 deletions pkg/controller/backup_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,12 @@ import (
velerov1api "github.com/vmware-tanzu/velero/pkg/apis/velero/v1"
pkgbackup "github.com/vmware-tanzu/velero/pkg/backup"
"github.com/vmware-tanzu/velero/pkg/constant"
"github.com/vmware-tanzu/velero/pkg/datamover"
"github.com/vmware-tanzu/velero/pkg/discovery"
"github.com/vmware-tanzu/velero/pkg/features"
"github.com/vmware-tanzu/velero/pkg/label"
"github.com/vmware-tanzu/velero/pkg/metrics"
"github.com/vmware-tanzu/velero/pkg/nodeagent"
"github.com/vmware-tanzu/velero/pkg/persistence"
"github.com/vmware-tanzu/velero/pkg/plugin/clientmgmt"
"github.com/vmware-tanzu/velero/pkg/plugin/framework"
Expand Down Expand Up @@ -494,6 +496,17 @@ func (b *backupReconciler) prepareBackupRequest(ctx context.Context, backup *vel
}
}

// validate that the node-agent daemonset is ready when snapshot data movement with
// the built-in data mover is requested. Without this, the DataUpload CR will be
// created but never processed (the DataUpload controller runs inside node-agent),
// causing the backup to hang until itemOperationTimeout expires.
if boolptr.IsSetToTrue(request.Spec.SnapshotMoveData) && datamover.IsBuiltInUploader(request.Spec.DataMover) {
if err := nodeagent.IsReady(ctx, request.Namespace, b.kbClient); err != nil {
request.Status.ValidationErrors = append(request.Status.ValidationErrors,
fmt.Sprintf("node-agent is not ready: %v; the built-in data mover requires node-agent to be deployed and running", err))
}
}

// Getting all information of cluster version - useful for future skip-level migration
if request.Annotations == nil {
request.Annotations = make(map[string]string)
Expand Down
105 changes: 105 additions & 0 deletions pkg/controller/backup_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
appsv1api "k8s.io/api/apps/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -492,6 +493,97 @@ func TestDefaultBackupTTL(t *testing.T) {
}
}

// TestPrepareBackupRequest_NodeAgentValidation verifies that prepareBackupRequest
// records a node-agent validation error when snapshot data movement with the
// built-in data mover is requested but the node-agent daemonset is absent or has
// no ready pods, and that the check is skipped for custom data movers or when
// data movement is disabled.
func TestPrepareBackupRequest_NodeAgentValidation(t *testing.T) {
	now, err := time.Parse(time.RFC1123Z, time.RFC1123Z)
	require.NoError(t, err)

	// Daemonset with ready pods: validation should pass.
	readyDS := &appsv1api.DaemonSet{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: velerov1api.DefaultNamespace,
			Name:      "node-agent",
		},
		Status: appsv1api.DaemonSetStatus{NumberReady: 3},
	}

	// Daemonset with zero ready pods: validation should fail.
	notReadyDS := &appsv1api.DaemonSet{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: velerov1api.DefaultNamespace,
			Name:      "node-agent",
		},
		Status: appsv1api.DaemonSetStatus{NumberReady: 0},
	}

	testCases := []struct {
		name            string
		backup          *velerov1api.Backup
		objs            []runtime.Object
		expectNodeAgent bool // whether a node-agent validation error is expected
	}{
		{
			name:            "snapshotMoveData with no node-agent daemonset",
			backup:          defaultBackup().SnapshotMoveData(true).Result(),
			expectNodeAgent: true,
		},
		{
			name:            "snapshotMoveData with node-agent daemonset not ready",
			backup:          defaultBackup().SnapshotMoveData(true).Result(),
			objs:            []runtime.Object{notReadyDS},
			expectNodeAgent: true,
		},
		{
			name:   "snapshotMoveData with ready node-agent daemonset",
			backup: defaultBackup().SnapshotMoveData(true).Result(),
			objs:   []runtime.Object{readyDS},
		},
		{
			name:   "snapshotMoveData with custom data mover skips check",
			backup: defaultBackup().SnapshotMoveData(true).DataMover("custom-mover").Result(),
		},
		{
			name:   "snapshotMoveData disabled skips check",
			backup: defaultBackup().Result(),
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			formatFlag := logging.FormatText
			logger := logging.DefaultLogger(logrus.DebugLevel, formatFlag)
			apiServer := velerotest.NewAPIServer(t)
			discoveryHelper, err := discovery.NewHelper(apiServer.DiscoveryClient, logger)
			require.NoError(t, err)

			reconciler := &backupReconciler{
				discoveryHelper:  discoveryHelper,
				kbClient:         velerotest.NewFakeControllerRuntimeClient(t, tc.objs...),
				defaultBackupTTL: 24 * 30 * time.Hour,
				clock:            testclocks.NewFakeClock(now),
				formatFlag:       formatFlag,
			}

			res := reconciler.prepareBackupRequest(ctx, tc.backup, logger)
			defer res.WorkerPool.Stop()

			if tc.expectNodeAgent {
				// At least one validation error must mention node-agent.
				found := false
				for _, e := range res.Status.ValidationErrors {
					if strings.Contains(e, "node-agent") {
						found = true
						break
					}
				}
				assert.True(t, found, "expected a node-agent validation error")
			} else {
				// No validation error may mention node-agent.
				for _, e := range res.Status.ValidationErrors {
					assert.NotContains(t, e, "node-agent")
				}
			}
		})
	}
}

func TestPrepareBackupRequest_SetsVGSLabelKey(t *testing.T) {
now, err := time.Parse(time.RFC1123Z, time.RFC1123Z)
require.NoError(t, err)
Expand Down Expand Up @@ -671,6 +763,17 @@ func TestProcessBackupCompletions(t *testing.T) {
now = now.Local()
timestamp := metav1.NewTime(now)

// Node-agent daemonset is needed for all test cases so that the node-agent
// validation in prepareBackupRequest does not reject backups that use
// snapshot data movement.
nodeAgentDS := &appsv1api.DaemonSet{
ObjectMeta: metav1.ObjectMeta{
Namespace: velerov1api.DefaultNamespace,
Name: "node-agent",
},
Status: appsv1api.DaemonSetStatus{NumberReady: 3},
}

tests := []struct {
name string
backup *velerov1api.Backup
Expand Down Expand Up @@ -1515,13 +1618,15 @@ func TestProcessBackupCompletions(t *testing.T) {
builder.ForVolumeSnapshotContent("testVSC").ObjectMeta(builder.WithLabels(velerov1api.BackupNameLabel, "backup-1")).VolumeSnapshotClassName("testClass").Status(&snapshotv1api.VolumeSnapshotContentStatus{
SnapshotHandle: &snapshotHandle,
}).Result(),
nodeAgentDS,
)
} else {
fakeClient = velerotest.NewFakeControllerRuntimeClient(t,
builder.ForVolumeSnapshotClass("testClass").Driver("testDriver").Result(),
builder.ForVolumeSnapshotContent("testVSC").ObjectMeta(builder.WithLabels(velerov1api.BackupNameLabel, "backup-1")).VolumeSnapshotClassName("testClass").Status(&snapshotv1api.VolumeSnapshotContentStatus{
SnapshotHandle: &snapshotHandle,
}).Result(),
nodeAgentDS,
)
}

Expand Down
15 changes: 15 additions & 0 deletions pkg/nodeagent/node_agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"fmt"

"github.com/pkg/errors"
appsv1api "k8s.io/api/apps/v1"
corev1api "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -80,6 +81,20 @@ func KbClientIsRunningInNode(ctx context.Context, namespace string, nodeName str
return isRunningInNode(ctx, namespace, nodeName, nil, kubeClient)
}

// IsReady checks whether the node-agent daemonset has at least one ready pod by
// inspecting the DaemonSet status. It returns nil when at least one pod is
// ready, and a descriptive error otherwise (including when the daemonset is
// missing entirely).
func IsReady(ctx context.Context, namespace string, crClient ctrlclient.Client) error {
	agentDS := &appsv1api.DaemonSet{}
	key := ctrlclient.ObjectKey{Namespace: namespace, Name: daemonSet}
	if err := crClient.Get(ctx, key, agentDS); err != nil {
		return errors.Wrap(err, "failed to get node-agent daemonset")
	}

	// NumberReady reflects how many node-agent pods currently pass readiness.
	if agentDS.Status.NumberReady > 0 {
		return nil
	}

	return errors.New("node-agent daemonset has no ready pods")
}

// IsRunningInNode checks if the node agent pod is running properly in a specified node through controller client. If not, return the error found
func IsRunningInNode(ctx context.Context, namespace string, nodeName string, crClient ctrlclient.Client) error {
return isRunningInNode(ctx, namespace, nodeName, crClient, nil)
Expand Down
58 changes: 58 additions & 0 deletions pkg/nodeagent/node_agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,64 @@ func TestIsRunningInNode(t *testing.T) {
}
}

// TestIsReady exercises IsReady against a fake controller-runtime client for
// the three interesting states: daemonset missing, daemonset present with no
// ready pods, and daemonset present with ready pods.
func TestIsReady(t *testing.T) {
	scheme := runtime.NewScheme()
	// Fix: the AddToScheme error was previously ignored; a registration
	// failure would otherwise surface later as a confusing Get error.
	if err := appsv1api.AddToScheme(scheme); err != nil {
		t.Fatalf("failed to add apps/v1 to scheme: %v", err)
	}

	dsNotReady := &appsv1api.DaemonSet{
		ObjectMeta: metav1.ObjectMeta{Namespace: "fake-ns", Name: "node-agent"},
		Status:     appsv1api.DaemonSetStatus{NumberReady: 0},
	}
	dsReady := &appsv1api.DaemonSet{
		ObjectMeta: metav1.ObjectMeta{Namespace: "fake-ns", Name: "node-agent"},
		Status:     appsv1api.DaemonSetStatus{NumberReady: 3},
	}

	tests := []struct {
		name          string
		kubeClientObj []runtime.Object
		namespace     string
		expectErr     string // empty means IsReady should succeed
	}{
		{
			name:      "daemonset not found",
			namespace: "fake-ns",
			expectErr: "failed to get node-agent daemonset: daemonsets.apps \"node-agent\" not found",
		},
		{
			name:      "daemonset exists but no ready pods",
			namespace: "fake-ns",
			kubeClientObj: []runtime.Object{
				dsNotReady,
			},
			expectErr: "node-agent daemonset has no ready pods",
		},
		{
			name:      "daemonset has ready pods",
			namespace: "fake-ns",
			kubeClientObj: []runtime.Object{
				dsReady,
			},
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			fakeClient := clientFake.NewClientBuilder().
				WithScheme(scheme).
				WithRuntimeObjects(test.kubeClientObj...).
				Build()

			err := IsReady(t.Context(), test.namespace, fakeClient)
			if test.expectErr == "" {
				assert.NoError(t, err)
			} else {
				assert.EqualError(t, err, test.expectErr)
			}
		})
	}
}

func TestGetPodSpec(t *testing.T) {
podSpec := corev1api.PodSpec{
NodeName: "fake-node",
Expand Down
Loading