-
Notifications
You must be signed in to change notification settings - Fork 225
Free orphaned proxy port on stop and rm #5394
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
gkatz2
wants to merge
1
commit into
stacklok:main
Choose a base branch
from
gkatz2:fix/orphan-proxy-on-stop-rm-5393
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+323
−27
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -101,6 +101,7 @@ type DefaultManager struct { | |
| retryConfig *retryConfig | ||
| newRunner mcpRunnerFactory | ||
| detachedSpawner detachedProcessSpawner | ||
| portFreer portFreer | ||
| } | ||
|
|
||
| // mcpRunner is the subset of *runner.Runner that RunWorkload's retry loop | ||
|
|
@@ -123,6 +124,11 @@ type mcpRunnerFactory func(*runner.RunConfig, statuses.StatusManager) mcpRunner | |
| // itself. | ||
| type detachedProcessSpawner func(ctx context.Context, runConfig *runner.RunConfig) (pid int, err error) | ||
|
|
||
| // portFreer kills an orphaned ToolHive proxy still holding a workload's proxy | ||
| // port. In production it is freePortHolderIfNeeded; tests override it to observe | ||
| // the stop/delete fallback without making real OS calls. | ||
| type portFreer func(ctx context.Context, runConfig *runner.RunConfig) | ||
|
|
||
| // retryConfig bundles the RunWorkload retry loop's tunable parameters. | ||
| type retryConfig struct { | ||
| maxRetries int | ||
|
|
@@ -160,6 +166,12 @@ func withDetachedSpawner(s detachedProcessSpawner) managerOption { | |
| return func(d *DefaultManager) { d.detachedSpawner = s } | ||
| } | ||
|
|
||
| // withPortFreer overrides the function used to free an orphaned proxy port | ||
| // during stop/delete. Intended for tests so they do not make real OS calls. | ||
| func withPortFreer(f portFreer) managerOption { | ||
| return func(d *DefaultManager) { d.portFreer = f } | ||
| } | ||
|
|
||
| // retryConfigOrDefault returns the manager's retryConfig if set, otherwise defaults. | ||
| func (d *DefaultManager) retryConfigOrDefault() retryConfig { | ||
| if d.retryConfig == nil { | ||
|
|
@@ -187,6 +199,15 @@ func (d *DefaultManager) detachedSpawnerOrDefault() detachedProcessSpawner { | |
| return d.spawnDetached | ||
| } | ||
|
|
||
| // portFreerOrDefault returns the manager's port freer if set, otherwise the | ||
| // production default that frees the proxy port via freePortHolderIfNeeded. | ||
| func (d *DefaultManager) portFreerOrDefault() portFreer { | ||
| if d.portFreer != nil { | ||
| return d.portFreer | ||
| } | ||
| return d.freePortHolderIfNeeded | ||
| } | ||
|
|
||
| // ErrWorkloadNotRunning is returned when a container cannot be found by name. | ||
| var ErrWorkloadNotRunning = fmt.Errorf("workload not running") | ||
|
|
||
|
|
@@ -407,9 +428,10 @@ func (d *DefaultManager) stopSingleWorkload(ctx context.Context, name string) er | |
| // First, try to load the run configuration to check if it's a remote workload | ||
| runConfig, err := runner.LoadState(childCtx, name) | ||
| if err != nil { | ||
| // If we can't load the state, it might be a container workload or the workload doesn't exist | ||
| // Try to stop it as a container workload | ||
| return d.stopContainerWorkload(childCtx, name) | ||
| // If we can't load the state, it might be a container workload or the workload doesn't exist. | ||
| // Try to stop it as a container workload. Without a run config we cannot recover the proxy | ||
| // port for port-based cleanup, so pass nil. | ||
| return d.stopContainerWorkload(childCtx, name, nil) | ||
| } | ||
|
|
||
| // Check if this is a remote workload | ||
|
|
@@ -418,7 +440,7 @@ func (d *DefaultManager) stopSingleWorkload(ctx context.Context, name string) er | |
| } | ||
|
|
||
| // This is a container-based workload | ||
| return d.stopContainerWorkload(childCtx, name) | ||
| return d.stopContainerWorkload(childCtx, name, runConfig) | ||
| } | ||
|
|
||
| // stopRemoteWorkload stops a remote workload | ||
|
|
@@ -467,8 +489,10 @@ func (d *DefaultManager) stopRemoteWorkload(ctx context.Context, name string, ru | |
| return nil | ||
| } | ||
|
|
||
| // stopContainerWorkload stops a container-based workload | ||
| func (d *DefaultManager) stopContainerWorkload(ctx context.Context, name string) error { | ||
| // stopContainerWorkload stops a container-based workload. runConfig may be nil | ||
| // when the run config could not be loaded; it is used only for port-based proxy | ||
| // cleanup when the tracked PID is unavailable. | ||
| func (d *DefaultManager) stopContainerWorkload(ctx context.Context, name string, runConfig *runner.RunConfig) error { | ||
| container, err := d.runtime.GetWorkloadInfo(ctx, name) | ||
| if err != nil { | ||
| if errors.Is(err, rt.ErrWorkloadNotFound) { | ||
|
|
@@ -492,7 +516,7 @@ func (d *DefaultManager) stopContainerWorkload(ctx context.Context, name string) | |
| } | ||
|
|
||
| // Use the existing stopWorkloads method for container workloads | ||
| return d.stopSingleContainerWorkload(ctx, &container) | ||
| return d.stopSingleContainerWorkload(ctx, &container, runConfig) | ||
| } | ||
|
|
||
| // RunWorkload runs a workload in the foreground with automatic restart on container exit. | ||
|
|
@@ -842,9 +866,10 @@ func (d *DefaultManager) deleteWorkload(ctx context.Context, name string) error | |
| // First, check if this is a remote workload by trying to load its run configuration | ||
| runConfig, err := runner.LoadState(childCtx, name) | ||
| if err != nil { | ||
| // If we can't load the state, it might be a container workload or the workload doesn't exist | ||
| // Continue with the container-based deletion logic | ||
| return d.deleteContainerWorkload(childCtx, name) | ||
| // If we can't load the state, it might be a container workload or the workload doesn't exist. | ||
| // Continue with the container-based deletion logic. Without a run config we cannot recover the | ||
| // proxy port for port-based cleanup, so pass nil. | ||
| return d.deleteContainerWorkload(childCtx, name, nil) | ||
| } | ||
|
|
||
| // If this is a remote workload (has RemoteURL), handle it differently | ||
|
|
@@ -853,7 +878,7 @@ func (d *DefaultManager) deleteWorkload(ctx context.Context, name string) error | |
| } | ||
|
|
||
| // This is a container-based workload, use the existing logic | ||
| return d.deleteContainerWorkload(childCtx, name) | ||
| return d.deleteContainerWorkload(childCtx, name, runConfig) | ||
| } | ||
|
|
||
| // deleteRemoteWorkload handles deletion of a remote workload | ||
|
|
@@ -883,8 +908,10 @@ func (d *DefaultManager) deleteRemoteWorkload(ctx context.Context, name string, | |
| return nil | ||
| } | ||
|
|
||
| // deleteContainerWorkload handles deletion of a container-based workload (existing logic) | ||
| func (d *DefaultManager) deleteContainerWorkload(ctx context.Context, name string) error { | ||
| // deleteContainerWorkload handles deletion of a container-based workload (existing logic). | ||
| // runConfig may be nil; when present it lets delete fall back to port-based proxy cleanup | ||
| // if the tracked PID is unavailable (e.g. a missing status file). | ||
| func (d *DefaultManager) deleteContainerWorkload(ctx context.Context, name string, runConfig *runner.RunConfig) error { | ||
|
|
||
| // Find and validate the container | ||
| container, err := d.getWorkloadContainer(ctx, name) | ||
|
|
@@ -921,7 +948,11 @@ func (d *DefaultManager) deleteContainerWorkload(ctx context.Context, name strin | |
| // Stop proxy-runner process AFTER container removal to prevent recreation | ||
| // Skip for auxiliary workloads like inspector that don't use proxy processes | ||
| if !isAuxiliary { | ||
| d.stopProxyIfNeeded(ctx, name, baseName) | ||
| if !d.stopProxyIfNeeded(ctx, name, baseName) { | ||
| // PID-based stop found no tracked proxy (e.g. the status file is missing); | ||
| // fall back to port-based cleanup. | ||
| d.portFreerOrDefault()(ctx, runConfig) | ||
| } | ||
| } else { | ||
| slog.Debug("skipping proxy-runner stop for auxiliary workload", "workload", name) | ||
| } | ||
|
|
@@ -976,40 +1007,51 @@ func (d *DefaultManager) isSupervisorProcessAlive(ctx context.Context, name stri | |
| return true | ||
| } | ||
|
|
||
| // stopProcess stops the proxy process associated with the container | ||
| func (d *DefaultManager) stopProcess(ctx context.Context, name string) { | ||
| // stopProcess stops the proxy process associated with the container. It reports | ||
| // whether a tracked proxy process was actually stopped: true only when a valid | ||
| // PID was found and killed. A false return (no PID recorded, e.g. a missing | ||
| // status file, or the kill failed) signals callers to fall back to port-based | ||
| // cleanup so an orphaned proxy does not keep holding the workload's port. | ||
| func (d *DefaultManager) stopProcess(ctx context.Context, name string) bool { | ||
| if name == "" { | ||
| slog.Warn("could not find base container name in labels") | ||
| return | ||
| return false | ||
| } | ||
|
|
||
| // Try to read the PID and kill the process | ||
| pid, err := d.statuses.GetWorkloadPID(ctx, name) | ||
| if err != nil { | ||
| slog.Debug("no PID found, proxy may not be running in detached mode", "workload", name) | ||
| return | ||
| return false | ||
| } | ||
|
|
||
| // PID found, try to kill the process | ||
| slog.Debug("stopping proxy process", "pid", pid) | ||
| stopped := false | ||
| if err := process.KillProcess(pid); err != nil { | ||
| slog.Debug("failed to kill proxy process", "error", err) | ||
| } else { | ||
| slog.Debug("proxy process stopped") | ||
| stopped = true | ||
| } | ||
|
|
||
| // Remove the PID of the terminated process | ||
| if err := d.statuses.ResetWorkloadPID(ctx, name); err != nil { | ||
| slog.Warn("failed to reset workload PID", "workload", name, "error", err) | ||
| } | ||
|
|
||
| return stopped | ||
| } | ||
|
|
||
| // stopProxyIfNeeded stops the proxy process if the workload has a base name | ||
| func (d *DefaultManager) stopProxyIfNeeded(ctx context.Context, name, baseName string) { | ||
| // stopProxyIfNeeded stops the proxy process if the workload has a base name. It | ||
| // reports whether a tracked proxy process was stopped (see stopProcess); false | ||
| // when there is no base name or no proxy was killed. | ||
| func (d *DefaultManager) stopProxyIfNeeded(ctx context.Context, name, baseName string) bool { | ||
| slog.Debug("removing proxy process", "workload", name) | ||
| if baseName != "" { | ||
| d.stopProcess(ctx, baseName) | ||
| if baseName == "" { | ||
| return false | ||
| } | ||
| return d.stopProcess(ctx, baseName) | ||
| } | ||
|
|
||
| // freePortHolderIfNeeded kills the process holding the proxy port if it is in use. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This doc comment only describes the restart re-bind case, but after this change freePortHolderIfNeeded is also the fallback used during stop/delete, where there is no child re-binding. Could you broaden it to cover both uses so it doesn't read as restart-only? |
||
|
|
@@ -1324,7 +1366,7 @@ func (d *DefaultManager) maybeSetupRemoteWorkload( | |
|
|
||
| // Ensure port is free before spawning. Kill the process holding the port if bound. | ||
| // This prevents "address already in use" when the new child tries to bind. | ||
| d.freePortHolderIfNeeded(ctx, mcpRunner.Config) | ||
| d.portFreerOrDefault()(ctx, mcpRunner.Config) | ||
|
|
||
| slog.Debug("loaded configuration from state", "workload", runConfig.BaseName) | ||
| return mcpRunner, nil | ||
|
|
@@ -1562,17 +1604,23 @@ func (*DefaultManager) cleanupTempPermissionProfile(ctx context.Context, baseNam | |
| return nil | ||
| } | ||
|
|
||
| // stopSingleContainerWorkload stops a single container workload | ||
| func (d *DefaultManager) stopSingleContainerWorkload(ctx context.Context, workload *rt.ContainerInfo) error { | ||
| // stopSingleContainerWorkload stops a single container workload. runConfig may | ||
| // be nil; when present it lets stop fall back to port-based proxy cleanup if the | ||
| // tracked PID is unavailable (e.g. a missing status file). | ||
| func (d *DefaultManager) stopSingleContainerWorkload( | ||
| ctx context.Context, workload *rt.ContainerInfo, runConfig *runner.RunConfig, | ||
| ) error { | ||
| childCtx, cancel := context.WithTimeout(context.Background(), AsyncOperationTimeout) | ||
| defer cancel() | ||
|
|
||
| name := labels.GetContainerBaseName(workload.Labels) | ||
| // Stop the proxy process (skip for auxiliary workloads like inspector) | ||
| if labels.IsAuxiliaryWorkload(workload.Labels) { | ||
| slog.Debug("skipping proxy stop for auxiliary workload", "workload", name) | ||
| } else { | ||
| d.stopProcess(ctx, name) | ||
| } else if !d.stopProcess(ctx, name) { | ||
| // PID-based stop found no tracked proxy (e.g. the status file is missing); | ||
| // fall back to port-based cleanup. | ||
| d.portFreerOrDefault()(ctx, runConfig) | ||
| } | ||
|
|
||
| slog.Debug("stopping containers", "workload", name) | ||
|
|
||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now that stopProxyIfNeeded reports whether a proxy was actually stopped, the container stop/delete paths use it to trigger the port-based fallback. stopRemoteWorkload (around line 472) still calls this and discards the result, so a remote workload with a missing status file wouldn't get the orphan-port cleanup. Remote workloads run a local proxy that holds a port too, so the same gap seems to apply there. Is the remote path intentionally out of scope for this PR, or worth adding the same
if !... { portFreerOrDefault()(ctx, runConfig) }fallback?