diff --git a/docs/architecture-overview.md b/docs/architecture-overview.md index 2882780a5..5a817baf2 100644 --- a/docs/architecture-overview.md +++ b/docs/architecture-overview.md @@ -73,16 +73,22 @@ markFailed() -> UPDATE status='failed' (retry < 3) Self-healing: messages in 'processing' for >60s reset to 'pending' ``` -### Circuit-Breaker (SessionRoutes) +### Circuit-Breaker (SessionRoutes / WorkerService) ```text -Generator crash -> retry 1 (1s) -> retry 2 (2s) -> retry 3 (4s) - -> consecutiveRestarts > 3 -> CIRCUIT-BREAKER +Generator crash -> retry 1 (1s) -> retry 2 (2s) -> retry 3 (4s) -> ... + -> windowed guard: >5 restarts in 60s -> CIRCUIT-BREAKER -> markAllSessionMessagesAbandoned(sessionDbId) -> Stop. No infinite loop. ``` -Counter resets to 0 when generator completes work naturally. +Uses a **windowed restart guard** (see `src/services/worker/RestartGuard.ts`): +only restarts within a 60-second window count toward the limit. +Long-running sessions that occasionally restart will never trip the guard; +tight crash-loops (e.g. persistent FK error) are caught within seconds. + +Counter decays automatically as timestamps leave the window, and resets +fully on clean completion (no pending work). ### Graceful Degradation (hook-command.ts) diff --git a/src/services/worker-service.ts b/src/services/worker-service.ts index 7b36a84b1..8420df4b0 100644 --- a/src/services/worker-service.ts +++ b/src/services/worker-service.ts @@ -84,6 +84,7 @@ import { SearchManager } from './worker/SearchManager.js'; import { FormattingService } from './worker/FormattingService.js'; import { TimelineService } from './worker/TimelineService.js'; import { SessionEventBroadcaster } from './worker/events/SessionEventBroadcaster.js'; +import { recordRestart, resetRestarts, RESTART_WINDOW_MS, MAX_RESTARTS_IN_WINDOW } from './worker/RestartGuard.js'; import { DEFAULT_CONFIG_PATH, DEFAULT_STATE_PATH, expandHomePath, loadTranscriptWatchConfig, writeSampleConfig } from './transcripts/config.js'; import { TranscriptWatcher } from './transcripts/watcher.js'; @@ -729,19 +730,25 @@ export class WorkerService { } // Fall through to pending-work restart below } - const MAX_PENDING_RESTARTS = 3; - if (pendingCount > 0) { - // Track consecutive pending-work restarts to prevent infinite loops (e.g. FK errors) - session.consecutiveRestarts = (session.consecutiveRestarts || 0) + 1; + // Initialize restartTimestamps for sessions created before the field existed + if (!session.restartTimestamps) { + session.restartTimestamps = []; + } + + // Windowed restart guard — only counts restarts within a recent window + const allowed = recordRestart(session); - if (session.consecutiveRestarts > MAX_PENDING_RESTARTS) { - logger.error('SYSTEM', 'Exceeded max pending-work restarts, stopping to prevent infinite loop', { + if (!allowed) { + logger.error('SYSTEM', 'Exceeded max pending-work restarts (windowed), stopping to prevent infinite loop', { sessionId: session.sessionDbId, pendingCount, - consecutiveRestarts: session.consecutiveRestarts + consecutiveRestarts: session.consecutiveRestarts, + restartsInWindow: session.restartTimestamps.length, + windowMs: RESTART_WINDOW_MS, + maxRestartsInWindow: MAX_RESTARTS_IN_WINDOW }); - session.consecutiveRestarts = 0; + resetRestarts(session); this.terminateSession(session.sessionDbId, 'max_restarts_exceeded'); return; } @@ -749,7 +756,9 @@ export class WorkerService { logger.info('SYSTEM', 'Pending work remains after generator exit, restarting with fresh AbortController', { sessionId: session.sessionDbId, pendingCount, - attempt: session.consecutiveRestarts + attempt: session.consecutiveRestarts, + restartsInWindow: session.restartTimestamps.length, + maxRestartsInWindow: MAX_RESTARTS_IN_WINDOW }); // Reset AbortController for restart session.abortController = new AbortController(); @@ -759,7 +768,7 @@ export class WorkerService { } else { // Successful completion with no pending work — clean up session // removeSessionImmediate fires onSessionDeletedCallback → broadcastProcessingStatus() - session.consecutiveRestarts = 0; + resetRestarts(session); this.sessionManager.removeSessionImmediate(session.sessionDbId); } }); diff --git a/src/services/worker-types.ts b/src/services/worker-types.ts index 1a8557a0c..8a91659c3 100644 --- a/src/services/worker-types.ts +++ b/src/services/worker-types.ts @@ -35,6 +35,7 @@ export interface ActiveSession { conversationHistory: ConversationMessage[]; // Shared conversation history for provider switching currentProvider: 'claude' | 'gemini' | 'openrouter' | null; // Track which provider is currently running consecutiveRestarts: number; // Track consecutive restart attempts to prevent infinite loops + restartTimestamps: number[]; // Windowed restart tracking — timestamps of recent restarts (see RestartGuard.ts) forceInit?: boolean; // Force fresh SDK session (skip resume) idleTimedOut?: boolean; // Set when session exits due to idle timeout (prevents restart loop) lastGeneratorActivity: number; // Timestamp of last generator progress (for stale detection, Issue #1099) diff --git a/src/services/worker/RestartGuard.ts b/src/services/worker/RestartGuard.ts new file mode 100644 index 000000000..3f33cab09 --- /dev/null +++ b/src/services/worker/RestartGuard.ts @@ -0,0 +1,95 @@ +/** + * Windowed Restart Guard + * + * Replaces the flat `consecutiveRestarts` counter with a time-windowed + * approach. Only restarts within a recent window are counted, so a + * long-running session that occasionally restarts will never hit the + * cap, while a tight crash-loop (persistent FK error, missing session + * ID, etc.) will trip the guard within seconds. + * + * Both `worker-service.ts` and `SessionRoutes.ts` share this module so + * the logic stays in one place. + * + * Issue: Generator restart guard strands pending messages with no recovery + */ + +// --------------------------------------------------------------------------- +// Tunables +// --------------------------------------------------------------------------- + +/** Time window (ms) in which restarts are counted. Restarts older than + * this are pruned and no longer contribute to the count. */ +export const RESTART_WINDOW_MS = 60_000; // 60 seconds + +/** Maximum restarts allowed inside the window before tripping the guard. + * "5 restarts in 60 s" catches tight loops while allowing healthy + * sessions to restart a handful of times per hour without issue. */ +export const MAX_RESTARTS_IN_WINDOW = 5; + +// --------------------------------------------------------------------------- +// Interface +// --------------------------------------------------------------------------- + +/** + * Minimal shape that any object must satisfy to participate in windowed + * restart tracking. `ActiveSession` satisfies this after the type + * update. + */ +export interface RestartTracker { + restartTimestamps: number[]; + consecutiveRestarts: number; +} + +// --------------------------------------------------------------------------- +// Core helpers +// --------------------------------------------------------------------------- + +/** + * Record a restart attempt and decide whether it should be allowed. + * + * 1. Prune timestamps older than `RESTART_WINDOW_MS`. + * 2. Push the current timestamp. + * 3. Sync `consecutiveRestarts` (kept for backward-compat logging). + * 4. Return `true` if the restart is within budget, `false` to block. + * + * @param tracker Session (or test stub) that holds the timestamps. + * @param now Current epoch ms — injectable for deterministic tests. + */ +export function recordRestart( + tracker: RestartTracker, + now: number = Date.now(), +): boolean { + // Prune stale entries + tracker.restartTimestamps = tracker.restartTimestamps.filter( + (ts) => now - ts < RESTART_WINDOW_MS, + ); + + // Record this restart + tracker.restartTimestamps.push(now); + + // Keep legacy field in sync for log output / backcompat + tracker.consecutiveRestarts = tracker.restartTimestamps.length; + + return tracker.restartTimestamps.length <= MAX_RESTARTS_IN_WINDOW; +} + +/** + * Reset the tracker — called on clean completion (no pending work). + */ +export function resetRestarts(tracker: RestartTracker): void { + tracker.restartTimestamps = []; + tracker.consecutiveRestarts = 0; +} + +/** + * Return the number of restarts still inside the current window. + * Useful for logging / diagnostics without mutating the tracker. + */ +export function getRecentRestartCount( + tracker: RestartTracker, + now: number = Date.now(), +): number { + return tracker.restartTimestamps.filter( + (ts) => now - ts < RESTART_WINDOW_MS, + ).length; +} diff --git a/src/services/worker/SessionManager.ts b/src/services/worker/SessionManager.ts index 5ef91aaf4..51982dab7 100644 --- a/src/services/worker/SessionManager.ts +++ b/src/services/worker/SessionManager.ts @@ -218,6 +218,7 @@ export class SessionManager { conversationHistory: [], // Initialize empty - will be populated by agents currentProvider: null, // Will be set when generator starts consecutiveRestarts: 0, // Track consecutive restart attempts to prevent infinite loops + restartTimestamps: [], // Windowed restart tracking (see RestartGuard.ts) processingMessageIds: [], // CLAIM-CONFIRM: Track message IDs for confirmProcessed() lastGeneratorActivity: Date.now() // Initialize for stale detection (Issue #1099) }; diff --git a/src/services/worker/http/routes/SessionRoutes.ts b/src/services/worker/http/routes/SessionRoutes.ts index 21ffbc339..a61e5f105 100644 --- a/src/services/worker/http/routes/SessionRoutes.ts +++ b/src/services/worker/http/routes/SessionRoutes.ts @@ -21,6 +21,7 @@ import { SessionCompletionHandler } from '../../session/SessionCompletionHandler import { PrivacyCheckValidator } from '../../validation/PrivacyCheckValidator.js'; import { SettingsDefaultsManager } from '../../../../shared/SettingsDefaultsManager.js'; import { USER_SETTINGS_PATH } from '../../../../shared/paths.js'; +import { recordRestart, resetRestarts, RESTART_WINDOW_MS, MAX_RESTARTS_IN_WINDOW } from '../../RestartGuard.js'; import { getProcessBySession, ensureProcessExit } from '../../ProcessRegistry.js'; import { getProjectContext } from '../../../../utils/project-name.js'; import { normalizePlatformSource } from '../../../../shared/platform-source.js'; @@ -289,9 +290,9 @@ export class SessionRoutes extends BaseRouteHandler { const pendingStore = this.sessionManager.getPendingMessageStore(); const pendingCount = pendingStore.getPendingCount(sessionDbId); - // CRITICAL: Limit consecutive restarts to prevent infinite loops - // This prevents runaway API costs when there's a persistent error (e.g., memorySessionId not captured) - const MAX_CONSECUTIVE_RESTARTS = 3; + // CRITICAL: Windowed restart guard — only counts restarts within a recent + // time window to prevent tight crash-loops while allowing healthy long-running + // sessions to restart occasionally without hitting the cap. (RestartGuard.ts) if (pendingCount > 0) { // GUARD: Prevent duplicate crash recovery spawns @@ -300,17 +301,29 @@ export class SessionRoutes extends BaseRouteHandler { return; } - session.consecutiveRestarts = (session.consecutiveRestarts || 0) + 1; + // Initialize restartTimestamps for sessions created before the field existed + if (!session.restartTimestamps) { + session.restartTimestamps = []; + } + + const allowed = recordRestart(session); - if (session.consecutiveRestarts > MAX_CONSECUTIVE_RESTARTS) { + if (!allowed) { logger.error('SESSION', `CRITICAL: Generator restart limit exceeded - stopping to prevent runaway costs`, { sessionId: sessionDbId, pendingCount, consecutiveRestarts: session.consecutiveRestarts, - maxRestarts: MAX_CONSECUTIVE_RESTARTS, - action: 'Generator will NOT restart. Check logs for root cause. Messages remain in pending state.' + restartsInWindow: session.restartTimestamps.length, + windowMs: RESTART_WINDOW_MS, + maxRestartsInWindow: MAX_RESTARTS_IN_WINDOW, + action: 'Generator will NOT restart. Messages will be marked abandoned.' + }); + // Mark pending messages as abandoned so they don't strand forever + const abandoned = pendingStore.markAllSessionMessagesAbandoned(sessionDbId); + logger.info('SESSION', 'Marked stranded messages as abandoned after restart guard trip', { + sessionId: sessionDbId, + abandoned }); - // Don't restart - abort to prevent further API calls session.abortController.abort(); return; } @@ -319,7 +332,8 @@ export class SessionRoutes extends BaseRouteHandler { sessionId: sessionDbId, pendingCount, consecutiveRestarts: session.consecutiveRestarts, - maxRestarts: MAX_CONSECUTIVE_RESTARTS + restartsInWindow: session.restartTimestamps.length, + maxRestartsInWindow: MAX_RESTARTS_IN_WINDOW }); // Abort OLD controller before replacing to prevent child process leaks @@ -344,8 +358,8 @@ export class SessionRoutes extends BaseRouteHandler { } else { // No pending work - abort to kill the child process session.abortController.abort(); - // Reset restart counter on successful completion - session.consecutiveRestarts = 0; + // Reset restart tracker on successful completion + resetRestarts(session); logger.debug('SESSION', 'Aborted controller after natural completion', { sessionId: sessionDbId }); diff --git a/tests/services/restart-guard.test.ts b/tests/services/restart-guard.test.ts new file mode 100644 index 000000000..5955deaa3 --- /dev/null +++ b/tests/services/restart-guard.test.ts @@ -0,0 +1,245 @@ +import { describe, it, expect } from 'bun:test'; +import { + recordRestart, + resetRestarts, + getRecentRestartCount, + RESTART_WINDOW_MS, + MAX_RESTARTS_IN_WINDOW, + type RestartTracker, +} from '../../src/services/worker/RestartGuard.js'; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function makeTracker(): RestartTracker { + return { restartTimestamps: [], consecutiveRestarts: 0 }; +} + +// --------------------------------------------------------------------------- +// recordRestart +// --------------------------------------------------------------------------- + +describe('RestartGuard — recordRestart', () => { + it('allows restarts up to the window limit', () => { + const t = makeTracker(); + const now = 1_000_000; + + for (let i = 0; i < MAX_RESTARTS_IN_WINDOW; i++) { + expect(recordRestart(t, now + i)).toBe(true); + } + // One more should be blocked + expect(recordRestart(t, now + MAX_RESTARTS_IN_WINDOW)).toBe(false); + }); + + it('blocks restarts beyond the limit within the same window', () => { + const t = makeTracker(); + const now = 1_000_000; + + // Fill up to limit + for (let i = 0; i <= MAX_RESTARTS_IN_WINDOW; i++) { + recordRestart(t, now + i); + } + // consecutiveRestarts should reflect windowed count + expect(t.consecutiveRestarts).toBe(MAX_RESTARTS_IN_WINDOW + 1); + expect(recordRestart(t, now + MAX_RESTARTS_IN_WINDOW + 1)).toBe(false); + }); + + it('allows restarts once old timestamps expire from the window', () => { + const t = makeTracker(); + const now = 1_000_000; + + // Fire restarts right at the limit + for (let i = 0; i < MAX_RESTARTS_IN_WINDOW; i++) { + recordRestart(t, now + i); + } + + // Next one in the same window is blocked + expect(recordRestart(t, now + 100)).toBe(false); + + // Jump forward past the window — all old timestamps should be pruned + // Need to exceed RESTART_WINDOW_MS from the latest timestamp (now + 100) + const future = now + RESTART_WINDOW_MS + 200; + expect(recordRestart(t, future)).toBe(true); + expect(t.consecutiveRestarts).toBe(1); // only the new one remains + }); + + it('prunes timestamps older than the window on each call', () => { + const t = makeTracker(); + const now = 1_000_000; + + recordRestart(t, now); + recordRestart(t, now + 1_000); + expect(t.restartTimestamps.length).toBe(2); + + // Jump past the window — both old timestamps should be pruned + const future = now + RESTART_WINDOW_MS + 2_000; + recordRestart(t, future); + expect(t.restartTimestamps.length).toBe(1); + expect(t.restartTimestamps[0]).toBe(future); + }); + + it('keeps consecutiveRestarts in sync with windowed count', () => { + const t = makeTracker(); + const now = 1_000_000; + + recordRestart(t, now); + expect(t.consecutiveRestarts).toBe(1); + + recordRestart(t, now + 500); + expect(t.consecutiveRestarts).toBe(2); + + // After window expires, counter should reset to just the new entry + // Need to exceed RESTART_WINDOW_MS from the latest timestamp (now + 500) + recordRestart(t, now + RESTART_WINDOW_MS + 1_000); + expect(t.consecutiveRestarts).toBe(1); + }); +}); + +// --------------------------------------------------------------------------- +// resetRestarts +// --------------------------------------------------------------------------- + +describe('RestartGuard — resetRestarts', () => { + it('clears all timestamps and counter', () => { + const t = makeTracker(); + recordRestart(t); + recordRestart(t); + expect(t.restartTimestamps.length).toBe(2); + expect(t.consecutiveRestarts).toBe(2); + + resetRestarts(t); + expect(t.restartTimestamps.length).toBe(0); + expect(t.consecutiveRestarts).toBe(0); + }); + + it('is idempotent', () => { + const t = makeTracker(); + resetRestarts(t); + resetRestarts(t); + expect(t.restartTimestamps.length).toBe(0); + expect(t.consecutiveRestarts).toBe(0); + }); +}); + +// --------------------------------------------------------------------------- +// getRecentRestartCount +// --------------------------------------------------------------------------- + +describe('RestartGuard — getRecentRestartCount', () => { + it('returns zero for a fresh tracker', () => { + const t = makeTracker(); + expect(getRecentRestartCount(t)).toBe(0); + }); + + it('counts only timestamps within the window', () => { + const t = makeTracker(); + const now = 1_000_000; + + // Two old, one recent + t.restartTimestamps = [ + now - RESTART_WINDOW_MS - 100, + now - RESTART_WINDOW_MS - 50, + now - 500, + ]; + + expect(getRecentRestartCount(t, now)).toBe(1); + }); + + it('does not mutate the tracker', () => { + const t = makeTracker(); + t.restartTimestamps = [Date.now() - RESTART_WINDOW_MS - 100, Date.now()]; + const before = [...t.restartTimestamps]; + getRecentRestartCount(t); + expect(t.restartTimestamps).toEqual(before); + }); +}); + +// --------------------------------------------------------------------------- +// Acceptance criteria +// --------------------------------------------------------------------------- + +describe('RestartGuard — acceptance criteria', () => { + it('a session that restarts a few times per hour is NOT terminated', () => { + const t = makeTracker(); + + // Simulate: 3 restarts spread across 2 hours (one every 40 minutes) + let now = 1_000_000; + expect(recordRestart(t, now)).toBe(true); + + now += 40 * 60_000; // +40 min + expect(recordRestart(t, now)).toBe(true); + expect(t.consecutiveRestarts).toBe(1); // previous one expired + + now += 40 * 60_000; // +40 min + expect(recordRestart(t, now)).toBe(true); + expect(t.consecutiveRestarts).toBe(1); + }); + + it('a tight crash-loop trips the guard within seconds', () => { + const t = makeTracker(); + const now = 1_000_000; + + // Simulate: 6 immediate restarts, 1 second apart + let blocked = false; + for (let i = 0; i < 10; i++) { + if (!recordRestart(t, now + i * 1_000)) { + blocked = true; + break; + } + } + expect(blocked).toBe(true); + // Should have blocked at attempt MAX_RESTARTS_IN_WINDOW + 1 + expect(t.consecutiveRestarts).toBe(MAX_RESTARTS_IN_WINDOW + 1); + }); + + it('restarts that happened within a short window are counted together', () => { + const t = makeTracker(); + const now = 1_000_000; + + // 4 restarts in 10 seconds — below MAX but clustered + for (let i = 0; i < 4; i++) { + recordRestart(t, now + i * 2_500); + } + expect(t.consecutiveRestarts).toBe(4); + + // 5th is still within window and at the limit + expect(recordRestart(t, now + 12_000)).toBe(true); + expect(t.consecutiveRestarts).toBe(5); + + // 6th trips it + expect(recordRestart(t, now + 15_000)).toBe(false); + }); + + it('guard resets naturally as time passes without needing clean completion', () => { + const t = makeTracker(); + const now = 1_000_000; + + // Hit the limit + for (let i = 0; i <= MAX_RESTARTS_IN_WINDOW; i++) { + recordRestart(t, now + i); + } + // Guard is tripped + expect(t.consecutiveRestarts).toBe(MAX_RESTARTS_IN_WINDOW + 1); + + // Wait for the window to expire + const future = now + RESTART_WINDOW_MS + 100; + const allowed = recordRestart(t, future); + expect(allowed).toBe(true); + expect(t.consecutiveRestarts).toBe(1); + }); +}); + +// --------------------------------------------------------------------------- +// Constants sanity checks +// --------------------------------------------------------------------------- + +describe('RestartGuard — constants', () => { + it('window is 60 seconds', () => { + expect(RESTART_WINDOW_MS).toBe(60_000); + }); + + it('max restarts in window is 5', () => { + expect(MAX_RESTARTS_IN_WINDOW).toBe(5); + }); +});