Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions docs/architecture-overview.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,16 +73,22 @@ markFailed() -> UPDATE status='failed' (retry < 3)
Self-healing: messages in 'processing' for >60s reset to 'pending'
```

### Circuit-Breaker (SessionRoutes)
### Circuit-Breaker (SessionRoutes / WorkerService)

```text
Generator crash -> retry 1 (1s) -> retry 2 (2s) -> retry 3 (4s)
-> consecutiveRestarts > 3 -> CIRCUIT-BREAKER
Generator crash -> retry 1 (1s) -> retry 2 (2s) -> retry 3 (4s) -> ...
-> windowed guard: >5 restarts in 60s -> CIRCUIT-BREAKER
-> markAllSessionMessagesAbandoned(sessionDbId)
-> Stop. No infinite loop.
```

Counter resets to 0 when generator completes work naturally.
Uses a **windowed restart guard** (see `src/services/worker/RestartGuard.ts`):
only restarts within a 60-second window count toward the limit.
Long-running sessions that occasionally restart will never trip the guard;
tight crash-loops (e.g. persistent FK error) are caught within seconds.

Counter decays automatically as timestamps leave the window, and resets
fully on clean completion (no pending work).

### Graceful Degradation (hook-command.ts)

Expand Down
29 changes: 19 additions & 10 deletions src/services/worker-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ import { SearchManager } from './worker/SearchManager.js';
import { FormattingService } from './worker/FormattingService.js';
import { TimelineService } from './worker/TimelineService.js';
import { SessionEventBroadcaster } from './worker/events/SessionEventBroadcaster.js';
import { recordRestart, resetRestarts, RESTART_WINDOW_MS, MAX_RESTARTS_IN_WINDOW } from './worker/RestartGuard.js';
import { DEFAULT_CONFIG_PATH, DEFAULT_STATE_PATH, expandHomePath, loadTranscriptWatchConfig, writeSampleConfig } from './transcripts/config.js';
import { TranscriptWatcher } from './transcripts/watcher.js';

Expand Down Expand Up @@ -729,27 +730,35 @@ export class WorkerService {
}
// Fall through to pending-work restart below
}
const MAX_PENDING_RESTARTS = 3;

if (pendingCount > 0) {
// Track consecutive pending-work restarts to prevent infinite loops (e.g. FK errors)
session.consecutiveRestarts = (session.consecutiveRestarts || 0) + 1;
// Initialize restartTimestamps for sessions created before the field existed
if (!session.restartTimestamps) {
session.restartTimestamps = [];
}

// Windowed restart guard — only counts restarts within a recent window
const allowed = recordRestart(session);

if (session.consecutiveRestarts > MAX_PENDING_RESTARTS) {
logger.error('SYSTEM', 'Exceeded max pending-work restarts, stopping to prevent infinite loop', {
if (!allowed) {
logger.error('SYSTEM', 'Exceeded max pending-work restarts (windowed), stopping to prevent infinite loop', {
sessionId: session.sessionDbId,
pendingCount,
consecutiveRestarts: session.consecutiveRestarts
consecutiveRestarts: session.consecutiveRestarts,
restartsInWindow: session.restartTimestamps.length,
windowMs: RESTART_WINDOW_MS,
maxRestartsInWindow: MAX_RESTARTS_IN_WINDOW
});
session.consecutiveRestarts = 0;
resetRestarts(session);
this.terminateSession(session.sessionDbId, 'max_restarts_exceeded');
return;
}

logger.info('SYSTEM', 'Pending work remains after generator exit, restarting with fresh AbortController', {
sessionId: session.sessionDbId,
pendingCount,
attempt: session.consecutiveRestarts
attempt: session.consecutiveRestarts,
restartsInWindow: session.restartTimestamps.length,
maxRestartsInWindow: MAX_RESTARTS_IN_WINDOW
});
// Reset AbortController for restart
session.abortController = new AbortController();
Expand All @@ -759,7 +768,7 @@ export class WorkerService {
} else {
// Successful completion with no pending work — clean up session
// removeSessionImmediate fires onSessionDeletedCallback → broadcastProcessingStatus()
session.consecutiveRestarts = 0;
resetRestarts(session);
this.sessionManager.removeSessionImmediate(session.sessionDbId);
}
});
Expand Down
1 change: 1 addition & 0 deletions src/services/worker-types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ export interface ActiveSession {
conversationHistory: ConversationMessage[]; // Shared conversation history for provider switching
currentProvider: 'claude' | 'gemini' | 'openrouter' | null; // Track which provider is currently running
consecutiveRestarts: number; // Track consecutive restart attempts to prevent infinite loops
restartTimestamps: number[]; // Windowed restart tracking — timestamps of recent restarts (see RestartGuard.ts)
forceInit?: boolean; // Force fresh SDK session (skip resume)
idleTimedOut?: boolean; // Set when session exits due to idle timeout (prevents restart loop)
lastGeneratorActivity: number; // Timestamp of last generator progress (for stale detection, Issue #1099)
Expand Down
95 changes: 95 additions & 0 deletions src/services/worker/RestartGuard.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/**
* Windowed Restart Guard
*
* Replaces the flat `consecutiveRestarts` counter with a time-windowed
* approach. Only restarts within a recent window are counted, so a
* long-running session that occasionally restarts will never hit the
* cap, while a tight crash-loop (persistent FK error, missing session
* ID, etc.) will trip the guard within seconds.
*
* Both `worker-service.ts` and `SessionRoutes.ts` share this module so
* the logic stays in one place.
*
* Issue: Generator restart guard strands pending messages with no recovery
*/

// ---------------------------------------------------------------------------
// Tunables
// ---------------------------------------------------------------------------

/**
 * Length (ms) of the sliding window in which restarts are counted.
 * A timestamp whose age is >= this value is pruned and no longer
 * contributes to the count.
 */
export const RESTART_WINDOW_MS = 60_000; // 60 seconds

/**
 * Maximum number of restarts (including the one being recorded) that may
 * fall inside the window. The restart that pushes the count above this
 * value is the one that gets blocked.
 */
export const MAX_RESTARTS_IN_WINDOW = 5;

// ---------------------------------------------------------------------------
// Interface
// ---------------------------------------------------------------------------

/**
 * Minimal shape an object must satisfy to participate in windowed restart
 * tracking.
 *
 * - `restartTimestamps`: epoch-ms timestamps of restarts recorded so far.
 * - `consecutiveRestarts`: legacy counter; `recordRestart` keeps it equal to
 *   the number of restarts currently inside the window.
 */
export interface RestartTracker {
  restartTimestamps: number[];
  consecutiveRestarts: number;
}

// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------

/**
 * Return a fresh array holding only the tracker's timestamps that are still
 * inside the window ending at `now`. Never mutates the tracker.
 *
 * The field is read as `unknown` on purpose: objects created before
 * `restartTimestamps` existed can lack it at runtime even though the type
 * declares it, and a missing/non-array value is treated as "no restarts"
 * instead of throwing.
 */
function timestampsInWindow(tracker: RestartTracker, now: number): number[] {
  const recorded: unknown = tracker.restartTimestamps;
  if (!Array.isArray(recorded)) {
    return [];
  }
  return recorded.filter((ts: number) => now - ts < RESTART_WINDOW_MS);
}

// ---------------------------------------------------------------------------
// Core helpers
// ---------------------------------------------------------------------------

/**
 * Record a restart attempt and decide whether it should be allowed.
 *
 * Steps:
 * 1. Drop timestamps whose age is >= `RESTART_WINDOW_MS`.
 * 2. Append `now`.
 * 3. Store the pruned list back on the tracker and sync
 *    `consecutiveRestarts` to its length (kept for log output / backcompat).
 *
 * The attempt is recorded even when it is rejected, so a blocked restart
 * still occupies a slot in the window.
 *
 * @param tracker Session (or test stub) that holds the timestamps. A tracker
 *                without a `restartTimestamps` array is initialised here.
 * @param now     Current epoch ms — injectable for deterministic tests.
 * @returns `true` if the number of restarts in the window (including this
 *          one) is at most `MAX_RESTARTS_IN_WINDOW`, `false` otherwise.
 */
export function recordRestart(
  tracker: RestartTracker,
  now: number = Date.now(),
): boolean {
  const recent = timestampsInWindow(tracker, now);
  recent.push(now);

  tracker.restartTimestamps = recent;
  tracker.consecutiveRestarts = recent.length;

  return recent.length <= MAX_RESTARTS_IN_WINDOW;
}

/**
 * Clear all restart history on the tracker: empties `restartTimestamps`
 * and sets `consecutiveRestarts` back to 0.
 */
export function resetRestarts(tracker: RestartTracker): void {
  tracker.restartTimestamps = [];
  tracker.consecutiveRestarts = 0;
}

/**
 * Count the restarts still inside the window ending at `now` without
 * mutating the tracker. Returns 0 for a tracker that has no
 * `restartTimestamps` array.
 *
 * @param tracker Session (or test stub) that holds the timestamps.
 * @param now     Current epoch ms — injectable for deterministic tests.
 */
export function getRecentRestartCount(
  tracker: RestartTracker,
  now: number = Date.now(),
): number {
  return timestampsInWindow(tracker, now).length;
}
1 change: 1 addition & 0 deletions src/services/worker/SessionManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ export class SessionManager {
conversationHistory: [], // Initialize empty - will be populated by agents
currentProvider: null, // Will be set when generator starts
consecutiveRestarts: 0, // Track consecutive restart attempts to prevent infinite loops
restartTimestamps: [], // Windowed restart tracking (see RestartGuard.ts)
processingMessageIds: [], // CLAIM-CONFIRM: Track message IDs for confirmProcessed()
lastGeneratorActivity: Date.now() // Initialize for stale detection (Issue #1099)
};
Expand Down
36 changes: 25 additions & 11 deletions src/services/worker/http/routes/SessionRoutes.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import { SessionCompletionHandler } from '../../session/SessionCompletionHandler
import { PrivacyCheckValidator } from '../../validation/PrivacyCheckValidator.js';
import { SettingsDefaultsManager } from '../../../../shared/SettingsDefaultsManager.js';
import { USER_SETTINGS_PATH } from '../../../../shared/paths.js';
import { recordRestart, resetRestarts, RESTART_WINDOW_MS, MAX_RESTARTS_IN_WINDOW } from '../../RestartGuard.js';
import { getProcessBySession, ensureProcessExit } from '../../ProcessRegistry.js';
import { getProjectContext } from '../../../../utils/project-name.js';
import { normalizePlatformSource } from '../../../../shared/platform-source.js';
Expand Down Expand Up @@ -289,9 +290,9 @@ export class SessionRoutes extends BaseRouteHandler {
const pendingStore = this.sessionManager.getPendingMessageStore();
const pendingCount = pendingStore.getPendingCount(sessionDbId);

// CRITICAL: Limit consecutive restarts to prevent infinite loops
// This prevents runaway API costs when there's a persistent error (e.g., memorySessionId not captured)
const MAX_CONSECUTIVE_RESTARTS = 3;
// CRITICAL: Windowed restart guard — only counts restarts within a recent
// time window to prevent tight crash-loops while allowing healthy long-running
// sessions to restart occasionally without hitting the cap. (RestartGuard.ts)

if (pendingCount > 0) {
// GUARD: Prevent duplicate crash recovery spawns
Expand All @@ -300,17 +301,29 @@ export class SessionRoutes extends BaseRouteHandler {
return;
}

session.consecutiveRestarts = (session.consecutiveRestarts || 0) + 1;
// Initialize restartTimestamps for sessions created before the field existed
if (!session.restartTimestamps) {
session.restartTimestamps = [];
}

const allowed = recordRestart(session);

if (session.consecutiveRestarts > MAX_CONSECUTIVE_RESTARTS) {
if (!allowed) {
logger.error('SESSION', `CRITICAL: Generator restart limit exceeded - stopping to prevent runaway costs`, {
sessionId: sessionDbId,
pendingCount,
consecutiveRestarts: session.consecutiveRestarts,
maxRestarts: MAX_CONSECUTIVE_RESTARTS,
action: 'Generator will NOT restart. Check logs for root cause. Messages remain in pending state.'
restartsInWindow: session.restartTimestamps.length,
windowMs: RESTART_WINDOW_MS,
maxRestartsInWindow: MAX_RESTARTS_IN_WINDOW,
action: 'Generator will NOT restart. Messages will be marked abandoned.'
});
// Mark pending messages as abandoned so they don't strand forever
const abandoned = pendingStore.markAllSessionMessagesAbandoned(sessionDbId);
logger.info('SESSION', 'Marked stranded messages as abandoned after restart guard trip', {
sessionId: sessionDbId,
abandoned
});
// Don't restart - abort to prevent further API calls
session.abortController.abort();
return;
}
Expand All @@ -319,7 +332,8 @@ export class SessionRoutes extends BaseRouteHandler {
sessionId: sessionDbId,
pendingCount,
consecutiveRestarts: session.consecutiveRestarts,
maxRestarts: MAX_CONSECUTIVE_RESTARTS
restartsInWindow: session.restartTimestamps.length,
maxRestartsInWindow: MAX_RESTARTS_IN_WINDOW
});

// Abort OLD controller before replacing to prevent child process leaks
Expand All @@ -344,8 +358,8 @@ export class SessionRoutes extends BaseRouteHandler {
} else {
// No pending work - abort to kill the child process
session.abortController.abort();
// Reset restart counter on successful completion
session.consecutiveRestarts = 0;
// Reset restart tracker on successful completion
resetRestarts(session);
logger.debug('SESSION', 'Aborted controller after natural completion', {
sessionId: sessionDbId
});
Expand Down
Loading