Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelogs/unreleased/9675-priyansh17
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix backup getting stuck in the WaitingForPluginOperations phase when a CSI VolumeSnapshot or VolumeSnapshotContent still reports an error after CSISnapshotTimeout has elapsed
37 changes: 31 additions & 6 deletions pkg/backup/actions/csi/volumesnapshot_action.go
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,21 @@ func (p *volumeSnapshotBackupItemAction) Progress(
if vs.Status.Error.Message != nil {
errorMessage = *vs.Status.Error.Message
}
p.log.Warnf("VolumeSnapshot has a temporary error %s. Snapshot controller will retry later.",
errorMessage)

timeout := backup.Spec.CSISnapshotTimeout.Duration
if timeout > 0 && time.Since(progress.Started) >= timeout {
p.log.Errorf(
"VolumeSnapshot %s/%s has a persistent error beyond CSISnapshotTimeout (%s): %s",
vs.Namespace, vs.Name, timeout, errorMessage)
progress.Completed = true
progress.Updated = time.Now()
progress.Err = fmt.Sprintf("VolumeSnapshot %s/%s has a persistent error: %s",
vs.Namespace, vs.Name, errorMessage)
return progress, nil
}

p.log.Warnf("VolumeSnapshot %s/%s has an error within the CSISnapshotTimeout window: %s. Snapshot controller will retry later.",
vs.Namespace, vs.Name, errorMessage)

return progress, nil
}
Expand Down Expand Up @@ -331,12 +344,24 @@ func (p *volumeSnapshotBackupItemAction) Progress(
progress.Completed = true
progress.Updated = now
} else if vsc.Status.Error != nil {
progress.Completed = true
progress.Updated = now
errorMessage := ""
if vsc.Status.Error.Message != nil {
progress.Err = *vsc.Status.Error.Message
errorMessage = *vsc.Status.Error.Message
}

timeout := backup.Spec.CSISnapshotTimeout.Duration
if timeout > 0 && time.Since(progress.Started) >= timeout {
p.log.Errorf(
"VolumeSnapshotContent %s has a persistent error beyond CSISnapshotTimeout (%s): %s",
vsc.Name, timeout, errorMessage)
progress.Completed = true
progress.Updated = now
progress.Err = fmt.Sprintf("VolumeSnapshotContent %s has a persistent error: %s",
vsc.Name, errorMessage)
} else {
p.log.Warnf("VolumeSnapshotContent %s has an error within the CSISnapshotTimeout window: %s. Snapshot controller will retry later.",
vsc.Name, errorMessage)
}
p.log.Warnf("VolumeSnapshotContent meets an error %s.", progress.Err)
}
}

Expand Down
47 changes: 43 additions & 4 deletions pkg/backup/actions/csi/volumesnapshot_action_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package csi
import (
"fmt"
"testing"
"time"

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
Expand Down Expand Up @@ -219,7 +220,7 @@ func TestVSProgress(t *testing.T) {
expectedErr: false,
},
{
name: "VS status has error",
name: "VS status has error, no prior annotation, no CSISnapshotTimeout configured",
operationID: "ns/name/2024-04-11T18:49:00+08:00",
vs: builder.ForVolumeSnapshot("ns", "name").Status().
StatusError(snapshotv1api.VolumeSnapshotError{
Expand All @@ -228,6 +229,30 @@ func TestVSProgress(t *testing.T) {
backup: builder.ForBackup("velero", "backup").Result(),
expectedErr: false,
},
{
name: "VS status has error, within CSISnapshotTimeout (recent start time)",
operationID: "ns/name/" + time.Now().Format(time.RFC3339),
vs: builder.ForVolumeSnapshot("ns", "name").Status().
StatusError(snapshotv1api.VolumeSnapshotError{
Message: &errorStr,
}).Result(),
backup: builder.ForBackup("velero", "backup").CSISnapshotTimeout(10 * time.Minute).Result(),
expectedErr: false,
},
{
name: "VS status has persistent error beyond CSISnapshotTimeout",
operationID: "ns/name/2024-04-11T18:49:00+08:00",
vs: builder.ForVolumeSnapshot("ns", "name").Status().
StatusError(snapshotv1api.VolumeSnapshotError{
Message: &errorStr,
}).Result(),
backup: builder.ForBackup("velero", "backup").CSISnapshotTimeout(10 * time.Minute).Result(),
expectedErr: false,
expectedProgress: &velero.OperationProgress{
Completed: true,
Err: fmt.Sprintf("VolumeSnapshot ns/name has a persistent error: %s", errorStr),
},
},
{
name: "Fail to get VSC",
operationID: "ns/name/2024-04-11T18:49:00+08:00",
Expand Down Expand Up @@ -259,7 +284,21 @@ func TestVSProgress(t *testing.T) {
expectedProgress: &velero.OperationProgress{Completed: true},
},
{
name: "VSC status has error",
name: "VSC status has error within CSISnapshotTimeout",
operationID: "ns/name/" + time.Now().Format(time.RFC3339),
vs: builder.ForVolumeSnapshot("ns", "name").Status().
ReadyToUse(true).BoundVolumeSnapshotContentName("vsc").Result(),
vsc: builder.ForVolumeSnapshotContent("vsc").
Status(&snapshotv1api.VolumeSnapshotContentStatus{
Error: &snapshotv1api.VolumeSnapshotError{
Message: &errorStr,
},
}).Result(),
backup: builder.ForBackup("velero", "backup").CSISnapshotTimeout(10 * time.Minute).Result(),
expectedErr: false,
},
{
name: "VSC status has persistent error beyond CSISnapshotTimeout",
operationID: "ns/name/2024-04-11T18:49:00+08:00",
vs: builder.ForVolumeSnapshot("ns", "name").Status().
ReadyToUse(true).BoundVolumeSnapshotContentName("vsc").Result(),
Expand All @@ -269,11 +308,11 @@ func TestVSProgress(t *testing.T) {
Message: &errorStr,
},
}).Result(),
backup: builder.ForBackup("velero", "backup").Result(),
backup: builder.ForBackup("velero", "backup").CSISnapshotTimeout(10 * time.Minute).Result(),
expectedErr: false,
expectedProgress: &velero.OperationProgress{
Completed: true,
Err: "error",
Err: fmt.Sprintf("VolumeSnapshotContent vsc has a persistent error: %s", errorStr),
},
},
}
Expand Down
Loading