diff --git a/changelogs/unreleased/9675-priyansh17 b/changelogs/unreleased/9675-priyansh17 new file mode 100644 index 0000000000..add6002cf7 --- /dev/null +++ b/changelogs/unreleased/9675-priyansh17 @@ -0,0 +1 @@ +Fix backup stuck in WaitingForPluginOperations when CSI VolumeSnapshot has a persistent error beyond CSISnapshotTimeout diff --git a/pkg/backup/actions/csi/volumesnapshot_action.go b/pkg/backup/actions/csi/volumesnapshot_action.go index 0e0e9a8404..23d547cc1e 100644 --- a/pkg/backup/actions/csi/volumesnapshot_action.go +++ b/pkg/backup/actions/csi/volumesnapshot_action.go @@ -300,8 +300,21 @@ func (p *volumeSnapshotBackupItemAction) Progress( if vs.Status.Error.Message != nil { errorMessage = *vs.Status.Error.Message } - p.log.Warnf("VolumeSnapshot has a temporary error %s. Snapshot controller will retry later.", - errorMessage) + + timeout := backup.Spec.CSISnapshotTimeout.Duration + if timeout > 0 && time.Since(progress.Started) >= timeout { + p.log.Errorf( + "VolumeSnapshot %s/%s has a persistent error beyond CSISnapshotTimeout (%s): %s", + vs.Namespace, vs.Name, timeout, errorMessage) + progress.Completed = true + progress.Updated = time.Now() + progress.Err = fmt.Sprintf("VolumeSnapshot %s/%s has a persistent error: %s", + vs.Namespace, vs.Name, errorMessage) + return progress, nil + } + + p.log.Warnf("VolumeSnapshot %s/%s has an error within the CSISnapshotTimeout window: %s. Snapshot controller will retry later.", + vs.Namespace, vs.Name, errorMessage) return progress, nil } @@ -331,12 +344,24 @@ func (p *volumeSnapshotBackupItemAction) Progress( progress.Completed = true progress.Updated = now } else if vsc.Status.Error != nil { - progress.Completed = true - progress.Updated = now + errorMessage := "" if vsc.Status.Error.Message != nil { - progress.Err = *vsc.Status.Error.Message + errorMessage = *vsc.Status.Error.Message + } + + timeout := backup.Spec.CSISnapshotTimeout.Duration + if timeout > 0 && time.Since(progress.Started) >= timeout { + p.log.Errorf( + "VolumeSnapshotContent %s has a persistent error beyond CSISnapshotTimeout (%s): %s", + vsc.Name, timeout, errorMessage) + progress.Completed = true + progress.Updated = now + progress.Err = fmt.Sprintf("VolumeSnapshotContent %s has a persistent error: %s", + vsc.Name, errorMessage) + } else { + p.log.Warnf("VolumeSnapshotContent %s has an error within the CSISnapshotTimeout window: %s. Snapshot controller will retry later.", + vsc.Name, errorMessage) } - p.log.Warnf("VolumeSnapshotContent meets an error %s.", progress.Err) } } diff --git a/pkg/backup/actions/csi/volumesnapshot_action_test.go b/pkg/backup/actions/csi/volumesnapshot_action_test.go index 1db5cd4753..1a1779c17a 100644 --- a/pkg/backup/actions/csi/volumesnapshot_action_test.go +++ b/pkg/backup/actions/csi/volumesnapshot_action_test.go @@ -19,6 +19,7 @@ package csi import ( "fmt" "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" @@ -219,7 +220,7 @@ func TestVSProgress(t *testing.T) { expectedErr: false, }, { - name: "VS status has error", + name: "VS status has error, no prior annotation, no CSISnapshotTimeout configured", operationID: "ns/name/2024-04-11T18:49:00+08:00", vs: builder.ForVolumeSnapshot("ns", "name").Status(). StatusError(snapshotv1api.VolumeSnapshotError{ @@ -228,6 +229,30 @@ func TestVSProgress(t *testing.T) { backup: builder.ForBackup("velero", "backup").Result(), expectedErr: false, }, + { + name: "VS status has error, within CSISnapshotTimeout (recent start time)", + operationID: "ns/name/" + time.Now().Format(time.RFC3339), + vs: builder.ForVolumeSnapshot("ns", "name").Status(). + StatusError(snapshotv1api.VolumeSnapshotError{ + Message: &errorStr, + }).Result(), + backup: builder.ForBackup("velero", "backup").CSISnapshotTimeout(10 * time.Minute).Result(), + expectedErr: false, + }, + { + name: "VS status has persistent error beyond CSISnapshotTimeout", + operationID: "ns/name/2024-04-11T18:49:00+08:00", + vs: builder.ForVolumeSnapshot("ns", "name").Status(). + StatusError(snapshotv1api.VolumeSnapshotError{ + Message: &errorStr, + }).Result(), + backup: builder.ForBackup("velero", "backup").CSISnapshotTimeout(10 * time.Minute).Result(), + expectedErr: false, + expectedProgress: &velero.OperationProgress{ + Completed: true, + Err: fmt.Sprintf("VolumeSnapshot ns/name has a persistent error: %s", errorStr), + }, + }, { name: "Fail to get VSC", operationID: "ns/name/2024-04-11T18:49:00+08:00", @@ -259,7 +284,21 @@ func TestVSProgress(t *testing.T) { expectedProgress: &velero.OperationProgress{Completed: true}, }, { - name: "VSC status has error", + name: "VSC status has error within CSISnapshotTimeout", + operationID: "ns/name/" + time.Now().Format(time.RFC3339), + vs: builder.ForVolumeSnapshot("ns", "name").Status(). + ReadyToUse(true).BoundVolumeSnapshotContentName("vsc").Result(), + vsc: builder.ForVolumeSnapshotContent("vsc"). + Status(&snapshotv1api.VolumeSnapshotContentStatus{ + Error: &snapshotv1api.VolumeSnapshotError{ + Message: &errorStr, + }, + }).Result(), + backup: builder.ForBackup("velero", "backup").CSISnapshotTimeout(10 * time.Minute).Result(), + expectedErr: false, + }, + { + name: "VSC status has persistent error beyond CSISnapshotTimeout", operationID: "ns/name/2024-04-11T18:49:00+08:00", vs: builder.ForVolumeSnapshot("ns", "name").Status(). ReadyToUse(true).BoundVolumeSnapshotContentName("vsc").Result(), @@ -269,11 +308,11 @@ func TestVSProgress(t *testing.T) { Message: &errorStr, }, }).Result(), - backup: builder.ForBackup("velero", "backup").Result(), + backup: builder.ForBackup("velero", "backup").CSISnapshotTimeout(10 * time.Minute).Result(), expectedErr: false, expectedProgress: &velero.OperationProgress{ Completed: true, - Err: "error", + Err: fmt.Sprintf("VolumeSnapshotContent vsc has a persistent error: %s", errorStr), }, }, }