Skip to content

Commit b3b2fe6

Browse files
authored
fix(docker): restart and wait for container readiness before exec (#688)
When Podman on Windows encounters instability, a container can exit during or after the first exec. Previously, the injection retry loop would retry docker exec 30 times against a dead container without ever attempting to restart it. Add a WaitContainerRunning method to DockerHelper, which polls docker inspect until the container reports running status, following the same wait.PollUntilContextTimeout pattern used by the Kubernetes driver. CommandDevContainer now detects stopped containers, restarts them, and waits for readiness before proceeding with exec. The injection retry predicate in inject.go now fails fast on terminal container states (dead/removing), using a sentinel error, instead of burning through all 30 retries.
1 parent bdc7124 commit b3b2fe6

3 files changed

Lines changed: 86 additions & 0 deletions

File tree

pkg/agent/inject.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313

1414
"github.com/sirupsen/logrus"
1515
"github.com/skevetter/devpod/pkg/config"
16+
"github.com/skevetter/devpod/pkg/docker"
1617
"github.com/skevetter/devpod/pkg/inject"
1718
"github.com/skevetter/devpod/pkg/shell"
1819
"github.com/skevetter/devpod/pkg/version"
@@ -193,6 +194,10 @@ func InjectAgent(opts *InjectOptions) error {
193194
if opts.Ctx.Err() != nil {
194195
return false
195196
}
197+
if errors.Is(err, docker.ErrContainerTerminal) {
198+
opts.Log.Errorf("container entered a terminal state, not retrying: %v", err)
199+
return false
200+
}
196201
opts.Log.Debugf("retrying injection: %v", err)
197202
return true
198203
}, func() error {

pkg/docker/helper.go

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@ import (
1010
"os"
1111
"os/exec"
1212
"strings"
13+
"time"
1314

1415
"github.com/skevetter/devpod/pkg/command"
1516
"github.com/skevetter/devpod/pkg/devcontainer/config"
1617
"github.com/skevetter/devpod/pkg/image"
1718
"github.com/skevetter/log"
1819
"github.com/skevetter/log/scanner"
20+
"k8s.io/apimachinery/pkg/util/wait"
1921
)
2022

2123
// DockerBuilder represents the Docker builder types.
@@ -204,6 +206,62 @@ func (r *DockerHelper) StartContainer(ctx context.Context, containerId string) e
204206
return nil
205207
}
206208

209+
// ErrContainerTerminal indicates a container entered an unrecoverable state
210+
// (e.g. "dead" or "removing") and cannot be restarted.
211+
var ErrContainerTerminal = errors.New("container in terminal state")
212+
213+
const (
214+
containerRunningPollInterval = 500 * time.Millisecond
215+
containerRunningTimeout = 30 * time.Second
216+
)
217+
218+
// WaitContainerRunning polls docker inspect until the container reports
219+
// status "running" or the context/timeout expires. It does not start
220+
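// the container — the caller is responsible for that. It returns early with an error if the container disappears or, wrapping ErrContainerTerminal, if it reports status "removing" or "dead".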
221+
func (r *DockerHelper) WaitContainerRunning(ctx context.Context, containerID string) error {
222+
var lastErr error
223+
pollErr := wait.PollUntilContextTimeout(
224+
ctx, containerRunningPollInterval, containerRunningTimeout, true,
225+
func(ctx context.Context) (bool, error) {
226+
details, err := r.InspectContainers(ctx, []string{containerID})
227+
if err != nil {
228+
lastErr = err
229+
r.Log.Debugf("WaitContainerRunning: inspect error (will retry): %v", err)
230+
return false, nil
231+
}
232+
if len(details) == 0 {
233+
return false, fmt.Errorf(
234+
"container %s disappeared while waiting for it to start",
235+
containerID,
236+
)
237+
}
238+
lastErr = nil
239+
status := strings.ToLower(details[0].State.Status)
240+
if status == "running" {
241+
return true, nil
242+
}
243+
if status == "removing" || status == "dead" {
244+
return false, fmt.Errorf(
245+
"%w: container %s is %q",
246+
ErrContainerTerminal,
247+
containerID,
248+
status,
249+
)
250+
}
251+
r.Log.Debugf(
252+
"WaitContainerRunning: container %s status=%s, waiting...",
253+
containerID,
254+
status,
255+
)
256+
return false, nil
257+
},
258+
)
259+
if pollErr != nil && lastErr != nil {
260+
return fmt.Errorf("%w (last inspect error: %v)", pollErr, lastErr)
261+
}
262+
return pollErr
263+
}
264+
207265
func (r *DockerHelper) GetImageTag(ctx context.Context, imageID string) (string, error) {
208266
args := []string{
209267
"inspect",

pkg/driver/docker/docker.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,29 @@ func (d *dockerDriver) CommandDevContainer(
9191
return fmt.Errorf("container not found")
9292
}
9393

94+
status := strings.ToLower(container.State.Status)
95+
if status == "dead" || status == "removing" {
96+
return fmt.Errorf(
97+
"%w: container %s is %q",
98+
docker.ErrContainerTerminal,
99+
container.ID,
100+
status,
101+
)
102+
}
103+
if status != "running" {
104+
d.Log.Infof(
105+
"container %s is not running (status=%s), restarting",
106+
container.ID, status,
107+
)
108+
if err := d.Docker.StartContainer(ctx, container.ID); err != nil {
109+
return fmt.Errorf("restart container: %w", err)
110+
}
111+
if err := d.Docker.WaitContainerRunning(ctx, container.ID); err != nil {
112+
return fmt.Errorf("wait for container to be running: %w", err)
113+
}
114+
d.Log.Infof("container %s is now running", container.ID)
115+
}
116+
94117
args := []string{"exec"}
95118
if stdin != nil {
96119
args = append(args, "-i")

0 commit comments

Comments
 (0)