diff --git a/apps/stage-tamagotchi/src/main/index.ts b/apps/stage-tamagotchi/src/main/index.ts index e87d06fc46..585fa2f922 100644 --- a/apps/stage-tamagotchi/src/main/index.ts +++ b/apps/stage-tamagotchi/src/main/index.ts @@ -36,6 +36,7 @@ import { setupAboutWindowReusable } from './windows/about' import { setupBeatSync } from './windows/beat-sync' import { setupCaptionWindowManager } from './windows/caption' import { setupChatWindowReusableFunc } from './windows/chat' +import { isDesktopOverlayEnabled, setupDesktopOverlayWindow } from './windows/desktop-overlay' import { setupDevtoolsWindow } from './windows/devtools' import { setupMainWindow } from './windows/main' import { setupNoticeWindowManager } from './windows/notice' @@ -194,6 +195,22 @@ app.whenReady().then(async () => { build: async ({ dependsOn }) => setupTray(dependsOn), }) + // Desktop grounding overlay — gated by AIRI_DESKTOP_OVERLAY=1 + if (isDesktopOverlayEnabled()) { + const desktopOverlay = injeca.provide('windows:desktop-overlay', { + dependsOn: { mcpStdioManager, serverChannel, i18n }, + build: async ({ dependsOn }) => setupDesktopOverlayWindow(dependsOn), + }) + + // NOTICE: Separate invoke ensures the overlay is eagerly built. + // Without this, injeca.start() would skip it because no other + // provider depends on 'windows:desktop-overlay'. 
+ injeca.invoke({ + dependsOn: { desktopOverlay }, + callback: noop, + }) + } + injeca.invoke({ dependsOn: { mainWindow, tray, serverChannel, airiHttpServer, pluginHost, mcpStdioManager, onboardingWindow: onboardingWindowManager, widgetsWindow: widgetsManager, artistryConfig }, callback: async (deps) => { diff --git a/apps/stage-tamagotchi/src/main/windows/desktop-overlay/index.ts b/apps/stage-tamagotchi/src/main/windows/desktop-overlay/index.ts new file mode 100644 index 0000000000..4945593de6 --- /dev/null +++ b/apps/stage-tamagotchi/src/main/windows/desktop-overlay/index.ts @@ -0,0 +1,148 @@ +/** + * Desktop Grounding Overlay — transparent always-on-top window + * + * Renders: + * - Ghost pointer dot at the snap-resolved click position + * - Bounding box around the matched target candidate + * - Source label + confidence badge + * - Stale flags + * + * Gated by AIRI_DESKTOP_OVERLAY=1 environment variable. + * When disabled, this module is a no-op. + * + * Data flow (v1): + * - The overlay renderer polls `computer_use::desktop_get_state` via the MCP bridge + * - No IPC push from main process to renderer + * - No Eventa channels or server push + * + * The overlay is click-through (setIgnoreMouseEvents) so it never + * intercepts real user or OS-level click events. 
+ */ + +import type { I18n } from '../../libs/i18n' +import type { ServerChannel } from '../../services/airi/channel-server' +import type { McpStdioManager } from '../../services/airi/mcp-servers' + +import { join, resolve } from 'node:path' + +import { BrowserWindow, screen } from 'electron' + +import { baseUrl, getElectronMainDirname, load, withHashRoute } from '../../libs/electron/location' +import { setupDesktopOverlayElectronInvokes } from './rpc/index.electron' + +/** Whether the desktop overlay feature is enabled (AIRI_DESKTOP_OVERLAY is exactly '1') */ +export function isDesktopOverlayEnabled(): boolean { + return process.env.AIRI_DESKTOP_OVERLAY === '1' +} + +let overlayWindow: BrowserWindow | null = null + +/** + * Create the transparent overlay window covering the full primary display. + * The window is: + * - Always on top ('screen-saver' level) + * - Click-through (ignoreMouseEvents) + * - Transparent and frameless + * - Not shown in taskbar / dock + * + * Resolves to null unless AIRI_DESKTOP_OVERLAY is '1'; otherwise resolves to the created window. + */ +export async function setupDesktopOverlayWindow(params: { + mcpStdioManager: McpStdioManager + serverChannel: ServerChannel + i18n: I18n +}): Promise<BrowserWindow | null> { + if (!isDesktopOverlayEnabled()) { + return null + } + + // Use primary display bounds (not just size) — the origin may be non-zero + // when multiple displays are arranged in macOS Display Preferences. 
+ const primaryDisplay = screen.getPrimaryDisplay() + const { x, y, width, height } = primaryDisplay.bounds + + overlayWindow = new BrowserWindow({ + title: 'AIRI Desktop Overlay', + width, + height, + x, + y, + show: false, + frame: false, + transparent: true, + alwaysOnTop: true, + skipTaskbar: true, + hasShadow: false, + // Round corners off for pixel-accurate overlay + roundedCorners: false, + // Prevent the overlay from stealing focus + focusable: false, + webPreferences: { + preload: join(getElectronMainDirname(), '../preload/index.mjs'), + sandbox: false, + // Disable background throttling so animations stay smooth + backgroundThrottling: false, + }, + }) + + // Make click-through: all mouse events pass through to the desktop + overlayWindow.setIgnoreMouseEvents(true, { forward: true }) + + // Raise to the 'screen-saver' always-on-top level (above all other windows) + overlayWindow.setAlwaysOnTop(true, 'screen-saver') + + // Prevent the window from appearing in screenshots/recordings if possible + overlayWindow.setContentProtection(true) + + // Keep the overlay visible on every workspace, including above fullscreen windows + overlayWindow.setVisibleOnAllWorkspaces(true, { visibleOnFullScreen: true }) + + overlayWindow.on('ready-to-show', () => { + overlayWindow?.show() + }) + + overlayWindow.on('closed', () => { + overlayWindow = null + }) + + // NOTICE: Wire eventa RPC BEFORE loading the renderer page. + // The overlay's onMounted fires during load() and immediately starts + // polling via callTool. If the handlers aren't registered yet, the + // first eventa invoke hangs forever (no response dispatched back to + // this window), and all subsequent poll cycles never fire because + // the poll loop awaits each call sequentially. 
+ await setupDesktopOverlayElectronInvokes({ + window: overlayWindow, + mcpStdioManager: params.mcpStdioManager, + serverChannel: params.serverChannel, + i18n: params.i18n, + }) + + // Load the overlay renderer page + await load( + overlayWindow, + withHashRoute( + baseUrl(resolve(getElectronMainDirname(), '..', 'renderer')), + '/desktop-overlay', + ), + ) + + return overlayWindow +} + +/** + * Get the current overlay window instance, or null before setup or after the window has closed. + */ +export function getDesktopOverlayWindow(): BrowserWindow | null { + return overlayWindow +} + +/** + * Tear down the overlay window: close it if it exists and is not already destroyed, then drop the module reference. + */ +export function destroyDesktopOverlay(): void { + if (overlayWindow && !overlayWindow.isDestroyed()) { + overlayWindow.close() + overlayWindow = null + } +} diff --git a/apps/stage-tamagotchi/src/main/windows/desktop-overlay/rpc/index.electron.ts b/apps/stage-tamagotchi/src/main/windows/desktop-overlay/rpc/index.electron.ts new file mode 100644 index 0000000000..64b674635f --- /dev/null +++ b/apps/stage-tamagotchi/src/main/windows/desktop-overlay/rpc/index.electron.ts @@ -0,0 +1,39 @@ +/** + * Desktop Overlay Window — Electron RPC bootstrap + * + * Minimal eventa context setup for the overlay BrowserWindow. + * Only registers base window services and MCP tool services — + * the overlay only needs callTool/listTools for polling + * `computer_use::desktop_get_state`. + * + * Follows the same pattern as main/chat/settings window RPC setups. 
+ */ + +import type { BrowserWindow } from 'electron' + +import type { I18n } from '../../../libs/i18n' +import type { ServerChannel } from '../../../services/airi/channel-server' +import type { McpStdioManager } from '../../../services/airi/mcp-servers' + +import { createContext } from '@moeru/eventa/adapters/electron/main' +import { ipcMain } from 'electron' + +import { createMcpServersService } from '../../../services/airi/mcp-servers' +import { setupBaseWindowElectronInvokes } from '../../shared/window' + +export async function setupDesktopOverlayElectronInvokes(params: { + window: BrowserWindow + mcpStdioManager: McpStdioManager + serverChannel: ServerChannel + i18n: I18n +}) { + // TODO: once eventa is refactored to support window-namespaced contexts, + // remove the setMaxListeners(0) call below (0 lifts Node's max-listener warning limit on ipcMain); + // eventa will then dispatch and manage events within its own context system. + ipcMain.setMaxListeners(0) + + const { context } = createContext(ipcMain, params.window) + + await setupBaseWindowElectronInvokes({ context, window: params.window, i18n: params.i18n, serverChannel: params.serverChannel }) + createMcpServersService({ context, manager: params.mcpStdioManager }) +} diff --git a/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-coordinates.test.ts b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-coordinates.test.ts new file mode 100644 index 0000000000..f2ff64f3dc --- /dev/null +++ b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-coordinates.test.ts @@ -0,0 +1,120 @@ +import { describe, expect, it } from 'vitest' + +import { + pointInOverlay, + rectIntersectsOverlay, + screenRectToLocal, + screenToLocal, +} from './desktop-overlay-coordinates' + +// --------------------------------------------------------------------------- +// screenToLocal +// --------------------------------------------------------------------------- + +describe('screenToLocal', () => { + it('subtracts overlay origin from screen point', () => { 
+ const result = screenToLocal({ x: 500, y: -800 }, { x: 0, y: -1080 }) + expect(result).toEqual({ x: 500, y: 280 }) + }) + + it('is identity when overlay origin is (0,0)', () => { + const result = screenToLocal({ x: 100, y: 200 }, { x: 0, y: 0 }) + expect(result).toEqual({ x: 100, y: 200 }) + }) + + it('handles negative overlay origin', () => { + const result = screenToLocal({ x: 441, y: -1037 }, { x: 0, y: -1080 }) + expect(result).toEqual({ x: 441, y: 43 }) + }) +}) + +// --------------------------------------------------------------------------- +// screenRectToLocal +// --------------------------------------------------------------------------- + +describe('screenRectToLocal', () => { + it('shifts rect origin, preserves size', () => { + const result = screenRectToLocal( + { x: 100, y: -1000, width: 80, height: 30 }, + { x: 0, y: -1080 }, + ) + expect(result).toEqual({ x: 100, y: 80, width: 80, height: 30 }) + }) + + it('is identity when overlay origin is (0,0)', () => { + const rect = { x: 50, y: 100, width: 200, height: 150 } + const result = screenRectToLocal(rect, { x: 0, y: 0 }) + expect(result).toEqual(rect) + }) +}) + +// --------------------------------------------------------------------------- +// rectIntersectsOverlay +// --------------------------------------------------------------------------- + +describe('rectIntersectsOverlay', () => { + const overlay = { x: 0, y: -1080, width: 1440, height: 900 } + + it('returns true for rect fully inside overlay', () => { + expect(rectIntersectsOverlay( + { x: 100, y: -1000, width: 80, height: 30 }, + overlay, + )).toBe(true) + }) + + it('returns true for rect partially overlapping', () => { + expect(rectIntersectsOverlay( + { x: 1400, y: -1080, width: 100, height: 50 }, + overlay, + )).toBe(true) + }) + + it('returns false for rect entirely above overlay', () => { + expect(rectIntersectsOverlay( + { x: 100, y: -2000, width: 80, height: 30 }, + overlay, + )).toBe(false) + }) + + it('returns false for rect 
entirely below overlay', () => { + expect(rectIntersectsOverlay( + { x: 100, y: 0, width: 80, height: 30 }, + overlay, + )).toBe(false) + }) + + it('returns false for rect entirely to the right', () => { + expect(rectIntersectsOverlay( + { x: 1500, y: -500, width: 80, height: 30 }, + overlay, + )).toBe(false) + }) +}) + +// --------------------------------------------------------------------------- +// pointInOverlay +// --------------------------------------------------------------------------- + +describe('pointInOverlay', () => { + const overlay = { x: 0, y: -1080, width: 1440, height: 900 } + + it('returns true for point inside', () => { + expect(pointInOverlay({ x: 720, y: -540 }, overlay)).toBe(true) + }) + + it('returns true for point at top-left corner', () => { + expect(pointInOverlay({ x: 0, y: -1080 }, overlay)).toBe(true) + }) + + it('returns false for point outside (below)', () => { + expect(pointInOverlay({ x: 720, y: 0 }, overlay)).toBe(false) + }) + + it('returns false for point outside (above)', () => { + expect(pointInOverlay({ x: 720, y: -1200 }, overlay)).toBe(false) + }) + + it('returns false for point outside (right)', () => { + expect(pointInOverlay({ x: 1500, y: -540 }, overlay)).toBe(false) + }) +}) diff --git a/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-coordinates.ts b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-coordinates.ts new file mode 100644 index 0000000000..3c57870bed --- /dev/null +++ b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-coordinates.ts @@ -0,0 +1,80 @@ +/** + * Desktop Overlay Coordinates — screen-absolute to overlay-local mapping. + * + * The computer-use-mcp returns all bounding boxes and points in + * screen-absolute logical pixels. The overlay window covers a single + * display whose origin may be non-zero (e.g. y = -1080 when a display + * is stacked above the primary). + * + * This module provides pure functions to: + * 1. 
Convert screen-absolute coords to overlay-local coords + * 2. Filter out candidates whose bounds don't intersect the overlay + */ + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface Rect { + x: number + y: number + width: number + height: number +} + +export interface Point { + x: number + y: number +} + +// --------------------------------------------------------------------------- +// Coordinate mapping +// --------------------------------------------------------------------------- + +/** + * Convert a screen-absolute point to overlay-local coordinates by subtracting the overlay origin. + */ +export function screenToLocal(point: Point, overlayOrigin: Point): Point { + return { + x: point.x - overlayOrigin.x, + y: point.y - overlayOrigin.y, + } +} + +/** + * Convert a screen-absolute rect to overlay-local coordinates. + * Size is preserved; only the origin is shifted. + */ +export function screenRectToLocal(rect: Rect, overlayOrigin: Point): Rect { + return { + x: rect.x - overlayOrigin.x, + y: rect.y - overlayOrigin.y, + width: rect.width, + height: rect.height, + } +} + +/** + * Check whether a screen-absolute rect intersects the overlay bounds (comparisons are strict, so a rect that merely touches an overlay edge does not count). + * Used to filter out candidates that are entirely on another display. + */ +export function rectIntersectsOverlay(rect: Rect, overlayBounds: Rect): boolean { + return ( + rect.x < overlayBounds.x + overlayBounds.width + && rect.x + rect.width > overlayBounds.x + && rect.y < overlayBounds.y + overlayBounds.height + && rect.y + rect.height > overlayBounds.y + ) +} + +/** + * Check whether a screen-absolute point is within the overlay bounds (left/top edges inclusive, right/bottom edges exclusive). 
+ */ +export function pointInOverlay(point: Point, overlayBounds: Rect): boolean { + return ( + point.x >= overlayBounds.x + && point.x < overlayBounds.x + overlayBounds.width + && point.y >= overlayBounds.y + && point.y < overlayBounds.y + overlayBounds.height + ) +} diff --git a/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-polling.test.ts b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-polling.test.ts new file mode 100644 index 0000000000..9b3cda1e72 --- /dev/null +++ b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-polling.test.ts @@ -0,0 +1,441 @@ +import type { McpCallToolResult } from '@proj-airi/stage-ui/tools/mcp' + +import type { OverlayState } from './desktop-overlay-polling' + +import { afterEach, describe, expect, it, vi } from 'vitest' + +import { + createEmptyOverlayState, + createOverlayPollController, + extractOverlayState, + extractRunStateFromResult, + MCP_TOOL_NAME, +} from './desktop-overlay-polling' + +// --------------------------------------------------------------------------- +// extractOverlayState +// --------------------------------------------------------------------------- + +describe('extractOverlayState', () => { + it('returns empty state when runState has no grounding data', () => { + const result = extractOverlayState({}) + expect(result.hasSnapshot).toBe(false) + expect(result.snapshotId).toBe('') + expect(result.candidates).toEqual([]) + expect(result.pointerIntent).toBeNull() + expect(result.staleFlags).toEqual({ screenshot: false, ax: false, chromeSemantic: false }) + }) + + it('extracts candidates from lastGroundingSnapshot', () => { + const result = extractOverlayState({ + lastGroundingSnapshot: { + snapshotId: 'dg_42', + targetCandidates: [ + { id: 't_0', source: 'chrome_dom', role: 'button', label: 'Submit', bounds: { x: 100, y: 200, width: 80, height: 30 }, confidence: 0.95 }, + { id: 't_1', source: 'ax', role: 'link', label: 'Help', bounds: { x: 300, y: 100, width: 40, height: 20 }, 
confidence: 0.7 }, + ], + staleFlags: { screenshot: false, ax: false, chromeSemantic: false }, + }, + }) + + expect(result.hasSnapshot).toBe(true) + expect(result.snapshotId).toBe('dg_42') + expect(result.candidates).toHaveLength(2) + expect(result.candidates[0].id).toBe('t_0') + expect(result.candidates[1].source).toBe('ax') + }) + + it('extracts pointer intent from lastPointerIntent', () => { + const result = extractOverlayState({ + lastPointerIntent: { + snappedPoint: { x: 140, y: 215 }, + candidateId: 't_0', + source: 'chrome_dom', + confidence: 0.95, + mode: 'execute', + }, + }) + + expect(result.pointerIntent).not.toBeNull() + expect(result.pointerIntent!.snappedPoint).toEqual({ x: 140, y: 215 }) + expect(result.pointerIntent!.candidateId).toBe('t_0') + expect(result.pointerIntent!.mode).toBe('execute') + }) + + it('detects stale flags', () => { + const result = extractOverlayState({ + lastGroundingSnapshot: { + snapshotId: 'dg_1', + targetCandidates: [], + staleFlags: { screenshot: true, ax: false, chromeSemantic: true }, + }, + }) + + expect(result.staleFlags.screenshot).toBe(true) + expect(result.staleFlags.ax).toBe(false) + expect(result.staleFlags.chromeSemantic).toBe(true) + }) + + it('handles snapshot with missing targetCandidates gracefully', () => { + const result = extractOverlayState({ + lastGroundingSnapshot: { + snapshotId: 'dg_1', + // targetCandidates intentionally missing + }, + }) + + expect(result.hasSnapshot).toBe(true) + expect(result.candidates).toEqual([]) + }) +}) + +// --------------------------------------------------------------------------- +// extractRunStateFromResult +// --------------------------------------------------------------------------- + +describe('extractRunStateFromResult', () => { + it('returns undefined for error results', () => { + const result = extractRunStateFromResult({ + isError: true, + content: [{ type: 'text', text: 'fail' }], + }) + expect(result).toBeUndefined() + }) + + it('extracts runState from 
structuredContent.runState', () => { + const result = extractRunStateFromResult({ + structuredContent: { + runState: { + lastGroundingSnapshot: { snapshotId: 'dg_1' }, + }, + }, + }) + expect(result).toBeDefined() + expect((result as any).lastGroundingSnapshot.snapshotId).toBe('dg_1') + }) + + it('falls back to structuredContent directly when no runState key', () => { + const result = extractRunStateFromResult({ + structuredContent: { + lastGroundingSnapshot: { snapshotId: 'dg_2' }, + }, + }) + expect(result).toBeDefined() + expect((result as any).lastGroundingSnapshot.snapshotId).toBe('dg_2') + }) + + it('returns undefined when structuredContent is missing', () => { + const result = extractRunStateFromResult({}) + expect(result).toBeUndefined() + }) +}) + +// --------------------------------------------------------------------------- +// createEmptyOverlayState +// --------------------------------------------------------------------------- + +describe('createEmptyOverlayState', () => { + it('returns consistent empty shape', () => { + const a = createEmptyOverlayState() + const b = createEmptyOverlayState() + + expect(a).toEqual(b) + expect(a.hasSnapshot).toBe(false) + expect(a.candidates).toEqual([]) + expect(a.pointerIntent).toBeNull() + + // Should not be the same reference (no shared mutation) + a.candidates.push({ id: 'x', source: 'raw', role: 'button', label: 'X', bounds: { x: 0, y: 0, width: 10, height: 10 }, confidence: 1 }) + expect(b.candidates).toHaveLength(0) + }) +}) + +// --------------------------------------------------------------------------- +// createOverlayPollController +// --------------------------------------------------------------------------- + +describe('createOverlayPollController', () => { + afterEach(() => { + vi.useRealTimers() + }) + + it('calls tool and delivers state on successful poll', async () => { + vi.useFakeTimers() + + const mockResult: McpCallToolResult = { + structuredContent: { + runState: { + lastGroundingSnapshot: { + 
snapshotId: 'dg_poll', + targetCandidates: [ + { id: 't_0', source: 'chrome_dom', role: 'button', label: 'OK', bounds: { x: 10, y: 20, width: 50, height: 25 }, confidence: 0.9 }, + ], + staleFlags: { screenshot: false, ax: false, chromeSemantic: false }, + }, + }, + }, + } + + const callTool = vi.fn<(name: string) => Promise>() + .mockResolvedValue(mockResult) + + const received: OverlayState[] = [] + + const controller = createOverlayPollController({ + callTool, + onState: (s) => { received.push(s) }, + intervalMs: 100, + fallbackIntervalMs: 200, + }) + + controller.start() + + // Let the first poll resolve + await vi.advanceTimersByTimeAsync(0) + + expect(callTool).toHaveBeenCalledWith(MCP_TOOL_NAME) + expect(received).toHaveLength(1) + expect(received[0].hasSnapshot).toBe(true) + expect(received[0].candidates[0].id).toBe('t_0') + + controller.stop() + }) + + it('clears the per-call timeout when the tool resolves before the timeout fires', async () => { + vi.useFakeTimers() + + const callTool = vi.fn<(name: string) => Promise>() + .mockResolvedValue({ structuredContent: {} }) + + const controller = createOverlayPollController({ + callTool, + onState: () => {}, + intervalMs: 100, + callTimeoutMs: 500, + }) + + controller.start() + await vi.advanceTimersByTimeAsync(0) + + // Only the next poll should remain scheduled. The per-call timeout must be cleared. 
+ expect(vi.getTimerCount()).toBe(1) + + controller.stop() + }) + + it('stops polling after stop() is called', async () => { + vi.useFakeTimers() + + const callTool = vi.fn<(name: string) => Promise>() + .mockResolvedValue({ structuredContent: {} }) + + const controller = createOverlayPollController({ + callTool, + onState: () => {}, + intervalMs: 100, + }) + + controller.start() + await vi.advanceTimersByTimeAsync(0) + expect(callTool).toHaveBeenCalledTimes(1) + + controller.stop() + expect(controller.isRunning()).toBe(false) + + // Advance past when next poll would have fired + await vi.advanceTimersByTimeAsync(200) + expect(callTool).toHaveBeenCalledTimes(1) // No additional calls + }) + + it('continues polling after a single failure', async () => { + vi.useFakeTimers() + + const callTool = vi.fn<(name: string) => Promise>() + .mockRejectedValueOnce(new Error('MCP down')) + .mockResolvedValue({ + structuredContent: { + runState: { + lastGroundingSnapshot: { + snapshotId: 'dg_recover', + targetCandidates: [], + staleFlags: { screenshot: false, ax: false, chromeSemantic: false }, + }, + }, + }, + }) + + const received: OverlayState[] = [] + + const controller = createOverlayPollController({ + callTool, + onState: (s) => { received.push(s) }, + intervalMs: 100, + fallbackIntervalMs: 200, + }) + + controller.start() + + // First poll: fails + await vi.advanceTimersByTimeAsync(0) + expect(callTool).toHaveBeenCalledTimes(1) + expect(received).toHaveLength(0) + + // Wait for fallback interval + await vi.advanceTimersByTimeAsync(200) + expect(callTool).toHaveBeenCalledTimes(2) + expect(received).toHaveLength(1) + expect(received[0].snapshotId).toBe('dg_recover') + + controller.stop() + }) + + it('is a no-op to call start() twice', async () => { + vi.useFakeTimers() + + const callTool = vi.fn<(name: string) => Promise>() + .mockResolvedValue({ structuredContent: {} }) + + const controller = createOverlayPollController({ + callTool, + onState: () => {}, + intervalMs: 100, 
+ }) + + controller.start() + controller.start() // Should not double-start + + await vi.advanceTimersByTimeAsync(0) + expect(callTool).toHaveBeenCalledTimes(1) // Not 2 + + controller.stop() + }) + + it('recovers from a hanging callTool via per-call timeout', async () => { + vi.useFakeTimers() + + const callTool = vi.fn<(name: string) => Promise>() + .mockImplementationOnce(() => new Promise(() => {})) + .mockResolvedValue({ + structuredContent: { + runState: { + lastGroundingSnapshot: { + snapshotId: 'dg_after_timeout', + targetCandidates: [], + staleFlags: { screenshot: false, ax: false, chromeSemantic: false }, + }, + }, + }, + }) + + const received: OverlayState[] = [] + + const controller = createOverlayPollController({ + callTool, + onState: (s) => { received.push(s) }, + intervalMs: 100, + fallbackIntervalMs: 200, + callTimeoutMs: 500, + }) + + controller.start() + + // First poll fires immediately, callTool hangs + await vi.advanceTimersByTimeAsync(0) + expect(callTool).toHaveBeenCalledTimes(1) + expect(received).toHaveLength(0) + + // Advance past the timeout and several fallback windows. The controller + // should allow a bounded recovery retry even though the original invoke + // is still hung in the background. 
+ await vi.advanceTimersByTimeAsync(500) + await vi.advanceTimersByTimeAsync(200) + expect(callTool).toHaveBeenCalledTimes(2) + expect(received).toHaveLength(1) + expect(received[0].snapshotId).toBe('dg_after_timeout') + + controller.stop() + }) + + it('caps outstanding timed-out polls to avoid unbounded buildup', async () => { + vi.useFakeTimers() + + const callTool = vi.fn<(name: string) => Promise>() + .mockImplementation(() => new Promise(() => {})) + + const controller = createOverlayPollController({ + callTool, + onState: () => {}, + intervalMs: 100, + fallbackIntervalMs: 200, + callTimeoutMs: 500, + }) + + controller.start() + + await vi.advanceTimersByTimeAsync(0) + expect(callTool).toHaveBeenCalledTimes(1) + + await vi.advanceTimersByTimeAsync(500) + await vi.advanceTimersByTimeAsync(200) + expect(callTool).toHaveBeenCalledTimes(2) + + await vi.advanceTimersByTimeAsync(500) + await vi.advanceTimersByTimeAsync(1000) + expect(callTool).toHaveBeenCalledTimes(2) + + controller.stop() + }) + + it('recovers again once a timed-out hung-call slot lease expires', async () => { + vi.useFakeTimers() + + const callTool = vi.fn<(name: string) => Promise>() + .mockImplementationOnce(() => new Promise(() => {})) + .mockImplementationOnce(() => new Promise(() => {})) + .mockResolvedValue({ + structuredContent: { + runState: { + lastGroundingSnapshot: { + snapshotId: 'dg_after_lease', + targetCandidates: [], + staleFlags: { screenshot: false, ax: false, chromeSemantic: false }, + }, + }, + }, + }) + + const received: OverlayState[] = [] + + const controller = createOverlayPollController({ + callTool, + onState: (state) => { + received.push(state) + }, + intervalMs: 100, + fallbackIntervalMs: 200, + callTimeoutMs: 500, + hungCallLeaseMs: 1000, + }) + + controller.start() + + await vi.advanceTimersByTimeAsync(0) + expect(callTool).toHaveBeenCalledTimes(1) + + await vi.advanceTimersByTimeAsync(500) + await vi.advanceTimersByTimeAsync(200) + 
expect(callTool).toHaveBeenCalledTimes(2) + + await vi.advanceTimersByTimeAsync(500) + await vi.advanceTimersByTimeAsync(200) + expect(callTool).toHaveBeenCalledTimes(2) + expect(received).toHaveLength(0) + + await vi.advanceTimersByTimeAsync(200) + expect(callTool).toHaveBeenCalledTimes(3) + expect(received).toHaveLength(1) + expect(received[0].snapshotId).toBe('dg_after_lease') + + controller.stop() + }) +}) diff --git a/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-polling.ts b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-polling.ts new file mode 100644 index 0000000000..1ee88e8f68 --- /dev/null +++ b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-polling.ts @@ -0,0 +1,269 @@ +/** + * Desktop Overlay Polling — pure logic for MCP state polling and data extraction. + * + * Extracted from desktop-overlay.vue so the core logic can be tested + * without a DOM environment or Vue test-utils. + */ + +import type { McpCallToolResult } from '@proj-airi/stage-ui/tools/mcp' + +// --------------------------------------------------------------------------- +// Types — minimal shapes matching RunState fields the overlay consumes +// --------------------------------------------------------------------------- + +export interface OverlayTargetCandidate { + id: string + source: string + role: string + label: string + bounds: { x: number, y: number, width: number, height: number } + confidence: number +} + +export interface OverlayPointerIntent { + snappedPoint: { x: number, y: number } + candidateId?: string + source: string + confidence: number + mode: string +} + +export interface OverlayStaleFlags { + screenshot: boolean + ax: boolean + chromeSemantic: boolean +} + +export interface OverlayState { + hasSnapshot: boolean + snapshotId: string + candidates: OverlayTargetCandidate[] + staleFlags: OverlayStaleFlags + pointerIntent: OverlayPointerIntent | null +} + +// --------------------------------------------------------------------------- +// 
State extraction +// --------------------------------------------------------------------------- + +const EMPTY_STALE: OverlayStaleFlags = { screenshot: false, ax: false, chromeSemantic: false } + +/** + * Create a default empty overlay state (fresh candidates array and staleFlags object on every call). + */ +export function createEmptyOverlayState(): OverlayState { + return { + hasSnapshot: false, + snapshotId: '', + candidates: [], + staleFlags: { ...EMPTY_STALE }, + pointerIntent: null, + } +} + +/** + * Extract overlay-relevant data from MCP runState. + * Returns a new OverlayState — does not mutate input (nested candidates, staleFlags and pointerIntent are shared by reference, not copied). + * + * This is the single source of truth for "what does the overlay show?" + */ +export function extractOverlayState(runState: Record<string, unknown>): OverlayState { + const result = createEmptyOverlayState() + + // Extract grounding snapshot + const snapshot = runState.lastGroundingSnapshot as Record<string, unknown> | undefined + if (snapshot) { + result.hasSnapshot = true + result.snapshotId = (snapshot.snapshotId as string) || '' + // Guard the unchecked cast: anything that is not an array (missing, null, malformed) becomes an empty list + result.candidates = Array.isArray(snapshot.targetCandidates) ? (snapshot.targetCandidates as OverlayTargetCandidate[]) : [] + result.staleFlags = (snapshot.staleFlags as OverlayStaleFlags) ?? { ...EMPTY_STALE } + } + + // Extract pointer intent + const rawIntent = runState.lastPointerIntent as OverlayPointerIntent | undefined + result.pointerIntent = rawIntent ?? null + + return result +} + +/** + * Extract runState from an MCP call result. + * Returns undefined if the result is an error or has no structured content. + */ +export function extractRunStateFromResult(result: McpCallToolResult): Record<string, unknown> | undefined { + if (result.isError) + return undefined + + const sc = result.structuredContent + if (!sc || typeof sc !== 'object') + return undefined + + // desktop_get_state returns { runState: { ... 
} } or the state directly + if ('runState' in sc && sc.runState && typeof sc.runState === 'object') { + return sc.runState as Record<string, unknown> + } + + return sc as Record<string, unknown> +} + +// --------------------------------------------------------------------------- +// Polling controller (framework-agnostic) +// --------------------------------------------------------------------------- + +export interface OverlayPollController { + /** Start polling. No-op if already running. */ + start: () => void + /** Stop polling. Cancels the pending timer; an in-flight call is not aborted. */ + stop: () => void + /** Whether the controller is actively polling. */ + isRunning: () => boolean +} + +export interface OverlayPollConfig { + /** Function to call MCP tool. */ + callTool: (name: string) => Promise<McpCallToolResult> + /** Callback with extracted state on each successful poll. */ + onState: (state: OverlayState) => void + /** Normal poll interval in ms. Default: 250. */ + intervalMs?: number + /** Fallback interval in ms, used after an error, an unusable result, or while throttled. Default: 500. */ + fallbackIntervalMs?: number + /** Per-call timeout in ms. Default: 5000. Prevents poll loop hang on startup race. */ + callTimeoutMs?: number + /** How long a timed-out background call occupies a recovery slot before we probe again. Default: 5000. */ + hungCallLeaseMs?: number +} + +const DEFAULT_INTERVAL = 250 +const DEFAULT_FALLBACK_INTERVAL = 500 +const DEFAULT_CALL_TIMEOUT = 5000 +const DEFAULT_HUNG_CALL_LEASE = 5000 +const MAX_BACKGROUND_HUNG_CALLS = 2 + +/** + * Qualified name of the MCP tool the overlay polls. + * NOTE(review): the `computer_use` prefix presumably matches the server key in mcp.json — confirm. + */ +export const MCP_TOOL_NAME = 'computer_use::desktop_get_state' + +/** + * Create a polling controller that periodically calls desktop_get_state + * and extracts overlay state. + */ +export function createOverlayPollController(config: OverlayPollConfig): OverlayPollController { + const normalInterval = config.intervalMs ?? DEFAULT_INTERVAL + const fallbackInterval = config.fallbackIntervalMs ?? DEFAULT_FALLBACK_INTERVAL + const hungCallLeaseMs = config.hungCallLeaseMs ?? 
DEFAULT_HUNG_CALL_LEASE + + let timer: ReturnType<typeof setTimeout> | null = null + let running = false + let inFlightCall: Promise<McpCallToolResult> | null = null + let backgroundHungSlots: Array<{ expiresAt: number }> = [] + + function scheduleNext(nextInterval: number) { + if (running) { + // Keep at most one pending timer: a stop() followed by start() while a call is + // still in flight would otherwise leave two poll loops scheduling independently. + if (timer !== null) { + clearTimeout(timer) + } + timer = setTimeout(poll, nextInterval) + } + } + + function pruneHungCallSlots(now: number) { + backgroundHungSlots = backgroundHungSlots.filter(slot => slot.expiresAt > now) + } + + async function poll() { + pruneHungCallSlots(Date.now()) + + if (inFlightCall || backgroundHungSlots.length >= MAX_BACKGROUND_HUNG_CALLS) { + scheduleNext(fallbackInterval) + return + } + + let nextInterval = normalInterval + let timeoutId: ReturnType<typeof setTimeout> | undefined + + try { + // NOTICE: Wrap callTool with a timeout to prevent the poll loop from + // hanging forever if the eventa invoke never resolves (e.g. during + // startup when the main-process RPC handlers may not be ready yet). + // NOTICE: The bridge does not expose abort semantics, so a timed-out + // call may still be pending in the background. We therefore track + // timed-out calls as expiring lease slots: the cap bounds how many + // unrecoverable invokes we tolerate at once, while lease expiry still + // lets the overlay probe again after a cooling-off window. 
+ let timedOutSlot: { expiresAt: number } | null = null + const currentCall = config.callTool(MCP_TOOL_NAME) + inFlightCall = currentCall + currentCall.then(() => { + if (timedOutSlot) { + backgroundHungSlots = backgroundHungSlots.filter(slot => slot !== timedOutSlot) + } + else if (inFlightCall === currentCall) { + inFlightCall = null + } + }, () => { + if (timedOutSlot) { + backgroundHungSlots = backgroundHungSlots.filter(slot => slot !== timedOutSlot) + } + else if (inFlightCall === currentCall) { + inFlightCall = null + } + }) + + const result = await Promise.race([ + currentCall, + new Promise<never>((_, reject) => + timeoutId = setTimeout(() => { + timedOutSlot = { + expiresAt: Date.now() + hungCallLeaseMs, + } + backgroundHungSlots = [...backgroundHungSlots, timedOutSlot] + if (inFlightCall === currentCall) { + inFlightCall = null + } + reject(new Error('callTool timeout')) + }, config.callTimeoutMs ?? DEFAULT_CALL_TIMEOUT), + ), + ]) + const runState = extractRunStateFromResult(result) + + if (runState) { + config.onState(extractOverlayState(runState)) + } + else { + nextInterval = fallbackInterval + } + } + catch { + // MCP server not running, bridge disconnected, or timeout — graceful degradation (an exception thrown by onState also lands here) + nextInterval = fallbackInterval + } + finally { + if (timeoutId !== undefined) { + clearTimeout(timeoutId) + } + } + + scheduleNext(nextInterval) + } + + return { + start() { + if (running) + return + running = true + // Start first poll immediately + poll() + }, + + stop() { + running = false + if (timer !== null) { + clearTimeout(timer) + timer = null + } + }, + + isRunning() { + return running + }, + } +} diff --git a/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay.vue b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay.vue new file mode 100644 index 0000000000..852d4e223c --- /dev/null +++ b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay.vue @@ -0,0 +1,290 @@ + + + + + diff --git a/packages/stage-ui/src/stores/index.ts 
b/packages/stage-ui/src/stores/index.ts index 15bcdeb187..e587ce8bae 100644 --- a/packages/stage-ui/src/stores/index.ts +++ b/packages/stage-ui/src/stores/index.ts @@ -1,5 +1,9 @@ export * from './background' export * from './display-models' +// NOTICE: `@proj-airi/stage-ui/stores` remains a valid package export path. +// Keep this barrel file pointing at real store modules so package resolution +// and typecheck stay valid even when consumers should prefer explicit subpaths. +export * from './mcp' export * from './modules/airi-card' export * from './modules/artistry' export * from './modules/consciousness' diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 31e0627a4f..5e0a84585c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -598,10 +598,10 @@ importers: dependencies: '@better-auth/drizzle-adapter': specifier: ^1.6.5 - version: 1.6.6(@better-auth/core@1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.1.8(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0))(@better-auth/utils@0.3.0)(drizzle-orm@0.45.2(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)(postgres@3.4.9)) + version: 1.6.6(@better-auth/core@1.6.6(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0))(@better-auth/utils@0.4.0)(drizzle-orm@0.45.2(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)(postgres@3.4.9)) '@better-auth/oauth-provider': specifier: 'catalog:' - version: 
1.5.6(@better-auth/core@1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.1.8(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0))(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-auth@1.6.6(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(better-sqlite3@12.9.0)(drizzle-kit@0.31.10)(drizzle-orm@0.45.2(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)(postgres@3.4.9))(pg@8.20.0)(vitest@4.1.5)(vue@3.5.33(typescript@5.9.3)))(better-call@1.1.8(zod@4.3.6)) + version: 1.5.6(@better-auth/core@1.6.6(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0))(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(better-auth@1.6.6(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(better-sqlite3@12.9.0)(drizzle-kit@0.31.10)(drizzle-orm@0.45.2(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)(postgres@3.4.9))(pg@8.20.0)(vitest@4.1.5)(vue@3.5.33(typescript@5.9.3)))(better-call@1.3.5(zod@4.3.6)) '@dotenvx/dotenvx': specifier: ^1.61.1 version: 1.61.4 @@ -722,7 +722,7 @@ importers: devDependencies: '@better-auth/cli': specifier: ^1.4.21 - version: 1.4.21(@better-fetch/fetch@1.1.21)(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(better-call@1.1.8(zod@4.3.6))(drizzle-kit@0.31.10)(jose@6.2.2)(kysely@0.28.16)(magicast@0.5.2)(nanostores@1.3.0)(postgres@3.4.9)(vitest@4.1.5)(vue@3.5.33(typescript@5.9.3)) + version: 1.4.21(@better-fetch/fetch@1.1.21)(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(drizzle-kit@0.31.10)(jose@6.2.2)(kysely@0.28.16)(magicast@0.5.2)(nanostores@1.3.0)(postgres@3.4.9)(vitest@4.1.5)(vue@3.5.33(typescript@5.9.3)) '@types/pg': specifier: ^8.20.0 version: 8.20.0 @@ -1423,7 +1423,7 @@ importers: 
version: 3.0.2(electron@41.2.2) '@electron-toolkit/tsconfig': specifier: ^2.0.0 - version: 2.0.0(@types/node@25.6.0) + version: 2.0.0(@types/node@24.12.2) '@electron-toolkit/utils': specifier: ^4.0.0 version: 4.0.0(electron@41.2.2) @@ -1462,7 +1462,7 @@ importers: version: 3.1.0 '@intlify/unplugin-vue-i18n': specifier: ^11.0.7 - version: 11.0.7(@vue/compiler-dom@3.5.33)(eslint@10.2.1(jiti@2.6.1))(rollup@4.60.2)(typescript@5.9.3)(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue-i18n@11.3.2(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3)) + version: 11.0.7(@vue/compiler-dom@3.5.33)(eslint@10.2.1(jiti@2.6.1))(rollup@4.60.2)(typescript@5.9.3)(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue-i18n@11.3.2(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3)) '@modelcontextprotocol/sdk': specifier: 'catalog:' version: 1.29.0(zod@4.3.6) @@ -1498,10 +1498,10 @@ importers: version: link:../../packages/ui-transitions '@proj-airi/unplugin-fetch': specifier: 'catalog:' - version: 0.2.3(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) + version: 0.2.3(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) '@proj-airi/unplugin-live2d-sdk': specifier: ^0.1.7 - version: 0.1.7(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + version: 0.1.7(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) '@types/audioworklet': specifier: 'catalog:' version: 0.0.97 @@ -1528,7 +1528,7 @@ importers: version: 2.10.3 '@vitejs/plugin-vue': specifier: ^6.0.6 - version: 6.0.6(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue@3.5.33(typescript@5.9.3)) + version: 
6.0.6(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue@3.5.33(typescript@5.9.3)) '@vue-macros/volar': specifier: ^3.1.2 version: 3.1.2(typescript@5.9.3)(vue-tsc@3.2.7(typescript@5.9.3))(vue@3.5.33(typescript@5.9.3)) @@ -1561,7 +1561,7 @@ importers: version: 6.8.3 electron-vite: specifier: ^5.0.0 - version: 5.0.0(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) + version: 5.0.0(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) get-port-please: specifier: 'catalog:' version: 3.2.0 @@ -1582,31 +1582,31 @@ importers: version: 2.2.6 unocss-preset-scrollbar: specifier: ^4.0.0 - version: 4.0.0(unocss@66.6.8(@emnapi/core@1.9.2)(@emnapi/runtime@1.10.0)(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))) + version: 4.0.0(unocss@66.6.8(@emnapi/core@1.9.2)(@emnapi/runtime@1.10.0)(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))) unplugin-info: specifier: ^1.3.2 - version: 1.3.2(esbuild@0.27.7)(rollup@4.60.2)(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) + version: 1.3.2(esbuild@0.27.7)(rollup@4.60.2)(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) unplugin-yaml: specifier: ^4.1.0 - version: 4.1.0(@nuxt/kit@3.21.2(magicast@0.5.2))(esbuild@0.27.7)(rolldown@1.0.0-rc.16)(rollup@4.60.2)(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) + version: 4.1.0(esbuild@0.27.7)(rolldown@1.0.0-rc.16)(rollup@4.60.2)(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) vite: specifier: 'catalog:' - version: 
8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + version: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) vite-bundle-visualizer: specifier: ^1.2.1 version: 1.2.1(rolldown@1.0.0-rc.16)(rollup@4.60.2) vite-plugin-mkcert: specifier: 'catalog:' - version: 2.0.0(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) + version: 2.0.0(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) vite-plugin-vue-devtools: specifier: ^8.1.1 - version: 8.1.1(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue@3.5.33(typescript@5.9.3)) + version: 8.1.1(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue@3.5.33(typescript@5.9.3)) vite-plugin-vue-layouts: specifier: ^0.11.0 - version: 0.11.0(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue-router@5.0.5(@vue/compiler-sfc@3.5.33)(pinia@3.0.4(typescript@5.9.3)(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3)) + version: 0.11.0(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue-router@5.0.5(@vue/compiler-sfc@3.5.33)(pinia@3.0.4(typescript@5.9.3)(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3)) vue-macros: specifier: ^3.1.2 - version: 3.1.2(@emnapi/core@1.9.2)(@emnapi/runtime@1.10.0)(@vueuse/core@14.2.1(vue@3.5.33(typescript@5.9.3)))(esbuild@0.27.7)(rolldown@1.0.0-rc.16)(rollup@4.60.2)(typescript@5.9.3)(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue-tsc@3.2.7(typescript@5.9.3))(vue@3.5.33(typescript@5.9.3)) + 
version: 3.1.2(@emnapi/core@1.9.2)(@emnapi/runtime@1.10.0)(@vueuse/core@14.2.1(vue@3.5.33(typescript@5.9.3)))(esbuild@0.27.7)(rolldown@1.0.0-rc.16)(rollup@4.60.2)(typescript@5.9.3)(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue-tsc@3.2.7(typescript@5.9.3))(vue@3.5.33(typescript@5.9.3)) vue-tsc: specifier: ^3.2.6 version: 3.2.7(typescript@5.9.3) @@ -18871,13 +18871,13 @@ snapshots: '@bcoe/v8-coverage@1.0.2': {} - '@better-auth/cli@1.4.21(@better-fetch/fetch@1.1.21)(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(better-call@1.1.8(zod@4.3.6))(drizzle-kit@0.31.10)(jose@6.2.2)(kysely@0.28.16)(magicast@0.5.2)(nanostores@1.3.0)(postgres@3.4.9)(vitest@4.1.5)(vue@3.5.33(typescript@5.9.3))': + '@better-auth/cli@1.4.21(@better-fetch/fetch@1.1.21)(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(drizzle-kit@0.31.10)(jose@6.2.2)(kysely@0.28.16)(magicast@0.5.2)(nanostores@1.3.0)(postgres@3.4.9)(vitest@4.1.5)(vue@3.5.33(typescript@5.9.3))': dependencies: '@babel/core': 7.29.0 '@babel/preset-react': 7.28.5(@babel/core@7.29.0) '@babel/preset-typescript': 7.28.5(@babel/core@7.29.0) - '@better-auth/core': 1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.1.8(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0) - '@better-auth/telemetry': 1.4.21(@better-auth/core@1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.1.8(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0)) + '@better-auth/core': 1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0) + '@better-auth/telemetry': 1.4.21(@better-auth/core@1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0)) '@better-auth/utils': 0.3.0 '@clack/prompts': 0.11.0 '@mrleebo/prisma-ast': 0.13.1 @@ -18954,6 +18954,17 
@@ snapshots: nanostores: 1.3.0 zod: 4.3.6 + '@better-auth/core@1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0)': + dependencies: + '@better-auth/utils': 0.3.0 + '@better-fetch/fetch': 1.1.21 + '@standard-schema/spec': 1.1.0 + better-call: 1.3.5(zod@4.3.6) + jose: 6.2.2 + kysely: 0.28.16 + nanostores: 1.3.0 + zod: 4.3.6 + '@better-auth/core@1.6.6(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0)': dependencies: '@better-auth/utils': 0.4.0 @@ -18968,13 +18979,6 @@ snapshots: optionalDependencies: '@opentelemetry/api': 1.9.1 - '@better-auth/drizzle-adapter@1.6.6(@better-auth/core@1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.1.8(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0))(@better-auth/utils@0.3.0)(drizzle-orm@0.45.2(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)(postgres@3.4.9))': - dependencies: - '@better-auth/core': 1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.1.8(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0) - '@better-auth/utils': 0.3.0 - optionalDependencies: - drizzle-orm: 0.45.2(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)(postgres@3.4.9) - '@better-auth/drizzle-adapter@1.6.6(@better-auth/core@1.6.6(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0))(@better-auth/utils@0.4.0)(drizzle-orm@0.45.2(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)(postgres@3.4.9))': dependencies: 
'@better-auth/core': 1.6.6(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0) @@ -18999,13 +19003,13 @@ snapshots: '@better-auth/core': 1.6.6(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0) '@better-auth/utils': 0.4.0 - '@better-auth/oauth-provider@1.5.6(@better-auth/core@1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.1.8(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0))(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-auth@1.6.6(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(better-sqlite3@12.9.0)(drizzle-kit@0.31.10)(drizzle-orm@0.45.2(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)(postgres@3.4.9))(pg@8.20.0)(vitest@4.1.5)(vue@3.5.33(typescript@5.9.3)))(better-call@1.1.8(zod@4.3.6))': + '@better-auth/oauth-provider@1.5.6(@better-auth/core@1.6.6(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0))(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(better-auth@1.6.6(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(better-sqlite3@12.9.0)(drizzle-kit@0.31.10)(drizzle-orm@0.45.2(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)(postgres@3.4.9))(pg@8.20.0)(vitest@4.1.5)(vue@3.5.33(typescript@5.9.3)))(better-call@1.3.5(zod@4.3.6))': dependencies: - '@better-auth/core': 1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.1.8(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0) - '@better-auth/utils': 0.3.0 + '@better-auth/core': 
1.6.6(@better-auth/utils@0.4.0)(@better-fetch/fetch@1.1.21)(@opentelemetry/api@1.9.1)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0) + '@better-auth/utils': 0.4.0 '@better-fetch/fetch': 1.1.21 better-auth: 1.6.6(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(better-sqlite3@12.9.0)(drizzle-kit@0.31.10)(drizzle-orm@0.45.2(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)(postgres@3.4.9))(pg@8.20.0)(vitest@4.1.5)(vue@3.5.33(typescript@5.9.3)) - better-call: 1.1.8(zod@4.3.6) + better-call: 1.3.5(zod@4.3.6) jose: 6.2.2 zod: 4.3.6 @@ -19016,9 +19020,9 @@ snapshots: optionalDependencies: '@prisma/client': 5.22.0 - '@better-auth/telemetry@1.4.21(@better-auth/core@1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.1.8(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0))': + '@better-auth/telemetry@1.4.21(@better-auth/core@1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0))': dependencies: - '@better-auth/core': 1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.1.8(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0) + '@better-auth/core': 1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0) '@better-auth/utils': 0.3.0 '@better-fetch/fetch': 1.1.21 @@ -19341,9 +19345,9 @@ snapshots: dependencies: electron: 41.2.2 - '@electron-toolkit/tsconfig@2.0.0(@types/node@25.6.0)': + '@electron-toolkit/tsconfig@2.0.0(@types/node@24.12.2)': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@electron-toolkit/utils@4.0.0(electron@41.2.2)': dependencies: @@ -20232,6 +20236,31 @@ snapshots: - supports-color - typescript + 
'@intlify/unplugin-vue-i18n@11.0.7(@vue/compiler-dom@3.5.33)(eslint@10.2.1(jiti@2.6.1))(rollup@4.60.2)(typescript@5.9.3)(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue-i18n@11.3.2(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3))': + dependencies: + '@eslint-community/eslint-utils': 4.9.1(eslint@10.2.1(jiti@2.6.1)) + '@intlify/bundle-utils': 11.0.7(vue-i18n@11.3.2(vue@3.5.33(typescript@5.9.3))) + '@intlify/shared': 11.3.2 + '@intlify/vue-i18n-extensions': 8.0.0(@intlify/shared@11.3.2)(@vue/compiler-dom@3.5.33)(vue-i18n@11.3.2(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3)) + '@rollup/pluginutils': 5.3.0(rollup@4.60.2) + '@typescript-eslint/scope-manager': 8.59.0 + '@typescript-eslint/typescript-estree': 8.59.0(typescript@5.9.3) + debug: 4.4.3(supports-color@10.2.2) + fast-glob: 3.3.3 + pathe: 2.0.3 + picocolors: 1.1.1 + unplugin: 2.3.11 + vite: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + vue: 3.5.33(typescript@5.9.3) + optionalDependencies: + vue-i18n: 11.3.2(vue@3.5.33(typescript@5.9.3)) + transitivePeerDependencies: + - '@vue/compiler-dom' + - eslint + - rollup + - supports-color + - typescript + '@intlify/unplugin-vue-i18n@11.0.7(@vue/compiler-dom@3.5.33)(eslint@10.2.1(jiti@2.6.1))(rollup@4.60.2)(typescript@5.9.3)(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue-i18n@11.3.2(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3))': dependencies: '@eslint-community/eslint-utils': 4.9.1(eslint@10.2.1(jiti@2.6.1)) @@ -22199,11 +22228,35 @@ snapshots: transitivePeerDependencies: - magicast + '@proj-airi/unplugin-fetch@0.2.3(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))': + dependencies: + ofetch: 1.5.1 + vite: 
8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + '@proj-airi/unplugin-fetch@0.2.3(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))': dependencies: ofetch: 1.5.1 vite: 8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + '@proj-airi/unplugin-live2d-sdk@0.1.7(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)': + dependencies: + ofetch: 1.5.1 + vite: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + yauzl: 3.3.0 + transitivePeerDependencies: + - '@types/node' + - '@vitejs/devtools' + - esbuild + - jiti + - less + - sass + - sass-embedded + - stylus + - sugarss + - terser + - tsx + - yaml + '@proj-airi/unplugin-live2d-sdk@0.1.7(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)': dependencies: ofetch: 1.5.1 @@ -22842,7 +22895,7 @@ snapshots: '@types/bunyan@1.8.11': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@types/cacheable-request@6.0.3': dependencies: @@ -22862,11 +22915,11 @@ snapshots: '@types/connect@3.4.38': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@types/cors@2.8.19': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@types/culori@4.0.1': {} @@ -23022,11 +23075,11 @@ snapshots: '@types/fs-extra@8.1.5': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@types/fs-extra@9.0.13': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@types/geojson@7946.0.16': {} @@ -23067,7 +23120,7 @@ snapshots: '@types/memcached@2.2.10': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@types/minimatch@3.0.5': {} @@ -23075,11 +23128,11 @@ snapshots: '@types/mysql@2.15.27': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@types/node-rsa@1.1.4': 
dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@types/node@14.18.63': {} @@ -23100,6 +23153,7 @@ snapshots: '@types/node@25.6.0': dependencies: undici-types: 7.19.2 + optional: true '@types/nprogress@0.2.3': {} @@ -23107,7 +23161,7 @@ snapshots: '@types/oracledb@6.5.2': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@types/parse-path@7.1.0': dependencies: @@ -23119,25 +23173,25 @@ snapshots: '@types/pg@8.15.6': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 pg-protocol: 1.13.0 pg-types: 2.2.0 '@types/pg@8.20.0': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 pg-protocol: 1.13.0 pg-types: 2.2.0 '@types/plist@3.0.5': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 xmlbuilder: 15.1.1 optional: true '@types/readable-stream@4.0.23': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@types/resolve@1.20.2': {} @@ -23157,7 +23211,7 @@ snapshots: '@types/tedious@4.0.14': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@types/three@0.184.0': dependencies: @@ -23187,7 +23241,7 @@ snapshots: '@types/ws@8.18.1': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@types/xast@2.0.4': dependencies: @@ -23195,7 +23249,7 @@ snapshots: '@types/yauzl@2.10.3': dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@typescript-eslint/eslint-plugin@8.59.0(@typescript-eslint/parser@8.59.0(eslint@10.2.1(jiti@2.6.1))(typescript@5.9.3))(eslint@10.2.1(jiti@2.6.1))(typescript@5.9.3)': dependencies: @@ -23614,6 +23668,12 @@ snapshots: vite: 6.4.2(@types/node@25.6.0)(jiti@2.6.1)(less@4.6.4)(lightningcss@1.32.0)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) vue: 3.5.33(typescript@5.9.3) + '@vitejs/plugin-vue@6.0.6(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue@3.5.33(typescript@5.9.3))': + dependencies: + '@rolldown/pluginutils': 1.0.0-rc.13 + vite: 
8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + vue: 3.5.33(typescript@5.9.3) + '@vitejs/plugin-vue@6.0.6(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue@3.5.33(typescript@5.9.3))': dependencies: '@rolldown/pluginutils': 1.0.0-rc.13 @@ -23916,6 +23976,15 @@ snapshots: transitivePeerDependencies: - vue + '@vue-macros/devtools@3.1.2(typescript@5.9.3)(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))': + dependencies: + sirv: 3.0.2 + vue: 3.5.33(typescript@5.9.3) + optionalDependencies: + vite: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + transitivePeerDependencies: + - typescript + '@vue-macros/devtools@3.1.2(typescript@5.9.3)(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))': dependencies: sirv: 3.0.2 @@ -24829,7 +24898,7 @@ snapshots: better-auth@1.4.21(@prisma/client@5.22.0)(better-sqlite3@12.9.0)(drizzle-kit@0.31.10)(drizzle-orm@0.41.0(@electric-sql/pglite@0.4.4)(@opentelemetry/api@1.9.1)(@prisma/client@5.22.0)(@types/pg@8.20.0)(better-sqlite3@12.9.0)(kysely@0.28.16)(pg@8.20.0)(postgres@3.4.9))(pg@8.20.0)(vitest@4.1.5)(vue@3.5.33(typescript@5.9.3)): dependencies: '@better-auth/core': 1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.1.8(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0) - '@better-auth/telemetry': 1.4.21(@better-auth/core@1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.1.8(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0)) + '@better-auth/telemetry': 1.4.21(@better-auth/core@1.4.21(@better-auth/utils@0.3.0)(@better-fetch/fetch@1.1.21)(better-call@1.3.5(zod@4.3.6))(jose@6.2.2)(kysely@0.28.16)(nanostores@1.3.0)) '@better-auth/utils': 0.3.0 '@better-fetch/fetch': 1.1.21 
'@noble/ciphers': 2.2.0 @@ -25200,7 +25269,7 @@ snapshots: chrome-launcher@1.2.0: dependencies: - '@types/node': 25.6.0 + '@types/node': 24.12.2 escape-string-regexp: 4.0.0 is-wsl: 2.2.0 lighthouse-logger: 2.0.2 @@ -26023,7 +26092,7 @@ snapshots: transitivePeerDependencies: - supports-color - electron-vite@5.0.0(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): + electron-vite@5.0.0(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): dependencies: '@babel/core': 7.29.0 '@babel/plugin-transform-arrow-functions': 7.27.1(@babel/core@7.29.0) @@ -26031,7 +26100,7 @@ snapshots: esbuild: 0.25.12 magic-string: 0.30.21 picocolors: 1.1.1 - vite: 8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + vite: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) transitivePeerDependencies: - supports-color @@ -26122,7 +26191,7 @@ snapshots: engine.io@6.6.6(bufferutil@4.1.0)(utf-8-validate@5.0.10): dependencies: '@types/cors': 2.8.19 - '@types/node': 25.6.0 + '@types/node': 24.12.2 '@types/ws': 8.18.1 accepts: 1.3.8 base64id: 2.0.0 @@ -26806,8 +26875,7 @@ snapshots: extsprintf@1.3.0: {} - extsprintf@1.4.1: - optional: true + extsprintf@1.4.1: {} fast-deep-equal@3.1.3: {} @@ -28977,7 +29045,7 @@ snapshots: debug: 4.4.3(supports-color@10.2.2) endian-toggle: 0.0.0 lodash.merge: 4.6.2 - minecraft-data: 3.109.0 + minecraft-data: 3.102.3 minecraft-folder-path: 1.2.0 node-fetch: 2.7.0 node-rsa: 0.4.2 @@ -30122,7 +30190,7 @@ snapshots: prismarine-registry@1.12.0: dependencies: - minecraft-data: 3.109.0 + minecraft-data: 3.102.3 prismarine-block: 1.23.0 prismarine-nbt: 2.8.0 transitivePeerDependencies: @@ -30207,7 +30275,7 @@ snapshots: '@protobufjs/path': 1.1.2 '@protobufjs/pool': 1.1.0 '@protobufjs/utf8': 1.1.0 - '@types/node': 25.6.0 + '@types/node': 24.12.2 
long: 5.3.2 protobufjs@8.0.1: @@ -30222,7 +30290,7 @@ snapshots: '@protobufjs/path': 1.1.2 '@protobufjs/pool': 1.1.0 '@protobufjs/utf8': 1.1.0 - '@types/node': 25.6.0 + '@types/node': 24.12.2 long: 5.3.2 protocols@2.0.2: {} @@ -31786,7 +31854,8 @@ snapshots: undici-types@7.16.0: {} - undici-types@7.19.2: {} + undici-types@7.19.2: + optional: true undici@6.24.1: {} @@ -31890,11 +31959,6 @@ snapshots: '@unocss/preset-mini': 66.6.8 unocss: 66.6.8(@emnapi/core@1.9.2)(@emnapi/runtime@1.10.0)(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) - unocss-preset-scrollbar@4.0.0(unocss@66.6.8(@emnapi/core@1.9.2)(@emnapi/runtime@1.10.0)(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))): - dependencies: - '@unocss/preset-mini': 66.6.8 - unocss: 66.6.8(@emnapi/core@1.9.2)(@emnapi/runtime@1.10.0)(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) - unocss@66.6.8(@emnapi/core@1.9.2)(@emnapi/runtime@1.10.0)(vite@6.4.2(@types/node@25.6.0)(jiti@2.6.1)(less@4.6.4)(lightningcss@1.32.0)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): dependencies: '@unocss/cli': 66.6.8 @@ -31979,6 +32043,14 @@ snapshots: unplugin: 2.3.11 vite: 8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + unplugin-combine@2.3.0(esbuild@0.27.7)(rolldown@1.0.0-rc.16)(rollup@4.60.2)(unplugin@2.3.11)(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): + optionalDependencies: + esbuild: 0.27.7 + rolldown: 1.0.0-rc.16 + rollup: 4.60.2 + unplugin: 2.3.11 + vite: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + 
unplugin-combine@2.3.0(esbuild@0.27.7)(rolldown@1.0.0-rc.16)(rollup@4.60.2)(unplugin@2.3.11)(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): optionalDependencies: esbuild: 0.27.7 @@ -32013,6 +32085,19 @@ snapshots: transitivePeerDependencies: - supports-color + unplugin-info@1.3.2(esbuild@0.27.7)(rollup@4.60.2)(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): + dependencies: + ci-info: 4.4.0 + git-url-parse: 16.1.0 + simple-git: 3.36.0 + unplugin: 2.3.11 + optionalDependencies: + esbuild: 0.27.7 + rollup: 4.60.2 + vite: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + transitivePeerDependencies: + - supports-color + unplugin-info@1.3.2(esbuild@0.27.7)(rollup@4.60.2)(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): dependencies: ci-info: 4.4.0 @@ -32122,6 +32207,17 @@ snapshots: rollup: 4.60.2 vite: 6.4.2(@types/node@25.6.0)(jiti@2.6.1)(less@4.6.4)(lightningcss@1.32.0)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + unplugin-yaml@4.1.0(esbuild@0.27.7)(rolldown@1.0.0-rc.16)(rollup@4.60.2)(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): + dependencies: + '@rollup/pluginutils': 5.3.0(rollup@4.60.2) + unplugin: 3.0.0 + yaml: 2.8.3 + optionalDependencies: + esbuild: 0.27.7 + rolldown: 1.0.0-rc.16 + rollup: 4.60.2 + vite: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + unplugin@2.3.11: dependencies: '@jridgewell/remapping': 2.3.5 @@ -32261,7 +32357,7 @@ snapshots: dependencies: assert-plus: 1.0.0 core-util-is: 1.0.2 - extsprintf: 1.3.0 + extsprintf: 1.4.1 verror@1.10.1: dependencies: @@ -32352,12 +32448,22 @@ snapshots: - rollup - supports-color + 
vite-dev-rpc@1.1.0(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): + dependencies: + birpc: 2.9.0 + vite: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + vite-hot-client: 2.1.0(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) + vite-dev-rpc@1.1.0(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): dependencies: birpc: 2.9.0 vite: 8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) vite-hot-client: 2.1.0(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) + vite-hot-client@2.1.0(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): + dependencies: + vite: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + vite-hot-client@2.1.0(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): dependencies: vite: 8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) @@ -32404,6 +32510,21 @@ snapshots: - tsx - yaml + vite-plugin-inspect@11.3.3(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): + dependencies: + ansis: 4.2.0 + debug: 4.4.3(supports-color@10.2.2) + error-stack-parser-es: 1.0.5 + ohash: 2.0.11 + open: 10.2.0 + perfect-debounce: 2.1.0 + sirv: 3.0.2 + unplugin-utils: 0.3.1 + vite: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + vite-dev-rpc: 1.1.0(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) + 
transitivePeerDependencies: + - supports-color + vite-plugin-inspect@11.3.3(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): dependencies: ansis: 4.2.0 @@ -32436,6 +32557,13 @@ snapshots: - typescript - utf-8-validate + vite-plugin-mkcert@2.0.0(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): + dependencies: + debug: 4.4.3(supports-color@10.2.2) + supports-color: 10.2.2 + undici: 8.1.0 + vite: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + vite-plugin-mkcert@2.0.0(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): dependencies: debug: 4.4.3(supports-color@10.2.2) @@ -32454,6 +32582,20 @@ snapshots: transitivePeerDependencies: - supports-color + vite-plugin-vue-devtools@8.1.1(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue@3.5.33(typescript@5.9.3)): + dependencies: + '@vue/devtools-core': 8.1.1(vue@3.5.33(typescript@5.9.3)) + '@vue/devtools-kit': 8.1.1 + '@vue/devtools-shared': 8.1.1 + sirv: 3.0.2 + vite: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + vite-plugin-inspect: 11.3.3(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) + vite-plugin-vue-inspector: 5.4.0(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) + transitivePeerDependencies: + - '@nuxt/kit' + - supports-color + - vue + vite-plugin-vue-devtools@8.1.1(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue@3.5.33(typescript@5.9.3)): dependencies: '@vue/devtools-core': 8.1.1(vue@3.5.33(typescript@5.9.3)) @@ -32468,6 +32610,21 @@ snapshots: - 
supports-color - vue + vite-plugin-vue-inspector@5.4.0(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): + dependencies: + '@babel/core': 7.29.0 + '@babel/plugin-proposal-decorators': 7.29.0(@babel/core@7.29.0) + '@babel/plugin-syntax-import-attributes': 7.28.6(@babel/core@7.29.0) + '@babel/plugin-syntax-import-meta': 7.10.4(@babel/core@7.29.0) + '@babel/plugin-transform-typescript': 7.28.6(@babel/core@7.29.0) + '@vue/babel-plugin-jsx': 1.5.0(@babel/core@7.29.0) + '@vue/compiler-dom': 3.5.33 + kolorist: 1.8.0 + magic-string: 0.30.21 + vite: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + transitivePeerDependencies: + - supports-color + vite-plugin-vue-inspector@5.4.0(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)): dependencies: '@babel/core': 7.29.0 @@ -32483,6 +32640,16 @@ snapshots: transitivePeerDependencies: - supports-color + vite-plugin-vue-layouts@0.11.0(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue-router@5.0.5(@vue/compiler-sfc@3.5.33)(pinia@3.0.4(typescript@5.9.3)(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3)): + dependencies: + debug: 4.4.3(supports-color@10.2.2) + fast-glob: 3.3.3 + vite: 8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3) + vue: 3.5.33(typescript@5.9.3) + vue-router: 5.0.5(@vue/compiler-sfc@3.5.33)(pinia@3.0.4(typescript@5.9.3)(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3)) + transitivePeerDependencies: + - supports-color + 
vite-plugin-vue-layouts@0.11.0(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue-router@5.0.5(@vue/compiler-sfc@3.5.33)(pinia@3.0.4(typescript@5.9.3)(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3)): dependencies: debug: 4.4.3(supports-color@10.2.2) @@ -32731,6 +32898,54 @@ snapshots: - vue-tsc - webpack + vue-macros@3.1.2(@emnapi/core@1.9.2)(@emnapi/runtime@1.10.0)(@vueuse/core@14.2.1(vue@3.5.33(typescript@5.9.3)))(esbuild@0.27.7)(rolldown@1.0.0-rc.16)(rollup@4.60.2)(typescript@5.9.3)(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue-tsc@3.2.7(typescript@5.9.3))(vue@3.5.33(typescript@5.9.3)): + dependencies: + '@vue-macros/better-define': 3.1.2(@emnapi/core@1.9.2)(@emnapi/runtime@1.10.0)(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/boolean-prop': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/chain-call': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/common': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/config': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/define-emit': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/define-models': 3.1.2(@vueuse/core@14.2.1(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/define-prop': 3.1.2(@emnapi/core@1.9.2)(@emnapi/runtime@1.10.0)(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/define-props': 3.1.2(@vue-macros/reactivity-transform@3.1.2(vue@3.5.33(typescript@5.9.3)))(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/define-props-refs': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/define-render': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/define-slots': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/define-stylex': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/devtools': 
3.1.2(typescript@5.9.3)(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) + '@vue-macros/export-expose': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/export-props': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/export-render': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/hoist-static': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/jsx-directive': 3.1.2(typescript@5.9.3) + '@vue-macros/named-template': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/reactivity-transform': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/script-lang': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/setup-block': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/setup-component': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/setup-sfc': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/short-bind': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/short-emits': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/short-vmodel': 3.1.2(vue@3.5.33(typescript@5.9.3)) + '@vue-macros/volar': 3.1.2(typescript@5.9.3)(vue-tsc@3.2.7(typescript@5.9.3))(vue@3.5.33(typescript@5.9.3)) + unplugin: 2.3.11 + unplugin-combine: 2.3.0(esbuild@0.27.7)(rolldown@1.0.0-rc.16)(rollup@4.60.2)(unplugin@2.3.11)(vite@8.0.9(@types/node@24.12.2)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3)) + unplugin-vue-define-options: 3.1.2(vue@3.5.33(typescript@5.9.3)) + vue: 3.5.33(typescript@5.9.3) + transitivePeerDependencies: + - '@emnapi/core' + - '@emnapi/runtime' + - '@rspack/core' + - '@vueuse/core' + - esbuild + - rolldown + - rollup + - typescript + - vite + - vue-tsc + - webpack + 
vue-macros@3.1.2(@emnapi/core@1.9.2)(@emnapi/runtime@1.10.0)(@vueuse/core@14.2.1(vue@3.5.33(typescript@5.9.3)))(esbuild@0.27.7)(rolldown@1.0.0-rc.16)(rollup@4.60.2)(typescript@5.9.3)(vite@8.0.9(@types/node@25.6.0)(esbuild@0.27.7)(jiti@2.6.1)(less@4.6.4)(terser@5.46.1)(tsx@4.21.0)(yaml@2.8.3))(vue-tsc@3.2.7(typescript@5.9.3))(vue@3.5.33(typescript@5.9.3)): dependencies: '@vue-macros/better-define': 3.1.2(@emnapi/core@1.9.2)(@emnapi/runtime@1.10.0)(vue@3.5.33(typescript@5.9.3)) diff --git a/services/computer-use-mcp/README.md b/services/computer-use-mcp/README.md index 8cbb675a63..ab70994ffe 100644 --- a/services/computer-use-mcp/README.md +++ b/services/computer-use-mcp/README.md @@ -268,9 +268,14 @@ On the AIRI desktop side, approvals are handled like this: For browser DOM automation, `computer-use-mcp` also exposes a local WebSocket bridge that matches the user's Chrome extension bridge pattern: 1. `computer-use-mcp` listens on `ws://127.0.0.1:8765` by default -2. the unpacked browser extension connects from its offscreen document +2. the unpacked browser extension background service worker connects to that socket 3. AIRI can then call `browser_dom_*` MCP tools against the active browser tab +If you override `COMPUTER_USE_BROWSER_DOM_BRIDGE_HOST` or +`COMPUTER_USE_BROWSER_DOM_BRIDGE_PORT`, mirror the same endpoint in the Chrome +extension via `chrome.storage.local.set({ browserDomBridgeHost, browserDomBridgePort })` +so the background worker reconnects to the correct socket. 
+ Use the two surfaces differently: - `desktop_*` for AIRI itself, native macOS apps, Electron windows, Finder, Terminal, VS Code diff --git a/services/computer-use-mcp/chrome-extension/README.md b/services/computer-use-mcp/chrome-extension/README.md new file mode 100644 index 0000000000..dfe332750b --- /dev/null +++ b/services/computer-use-mcp/chrome-extension/README.md @@ -0,0 +1,69 @@ +# AIRI Desktop Grounding — Chrome Extension + +Read-only Chrome DOM observation bridge for the AIRI Desktop Grounding layer. + +## What it does + +- Collects interactive elements (buttons, links, inputs, etc.) from all frames in the active Chrome tab +- Reports element positions, ARIA roles, text, and rect coordinates +- Feeds this data into the desktop grounding snap resolver for coordinate mapping + +## What it does NOT do + +- ❌ No DOM mutations (no clicking, typing, scrolling on DOM elements) +- ❌ No `eval` / `new Function` / `chrome.scripting.executeScript` +- ❌ No external network requests (no Python bridge, no offscreen documents) +- ❌ No popup UI + +All user interactions are performed via real macOS OS-level input events (CGEvent) through the desktop grounding executor. + +## Architecture + +``` +background.js (Service Worker) + ↕ chrome.tabs.sendMessage +msg_bridge.js (ISOLATED world) + ↕ window.postMessage +content.js (MAIN world, window.__AIRI_DG__) +``` + +## Installation (development) + +1. Open `chrome://extensions/` +2. Enable "Developer mode" +3. Click "Load unpacked" +4. Select this `chrome-extension/` directory +5. The extension will auto-inject into all pages + +## Bridge endpoint override + +By default the background worker connects to `ws://127.0.0.1:8765`. 
+ +If `computer-use-mcp` is running with a non-default +`COMPUTER_USE_BROWSER_DOM_BRIDGE_HOST` or `COMPUTER_USE_BROWSER_DOM_BRIDGE_PORT`, +override the extension endpoint through `chrome.storage.local`: + +```js +await chrome.storage.local.set({ + browserDomBridgeHost: '127.0.0.1', + browserDomBridgePort: 8876, +}) +``` + +The service worker watches these keys and reconnects automatically. + +## Supported commands + +| Command | Description | +|---------|-------------| +| `getActiveTab` | Get active tab info (id, url, title) | +| `getAllFrames` | List all frames in active tab | +| `readAllFramesDOM` | Collect interactive elements from all frames | +| `findElement` | Find single element by CSS selector | +| `findElements` | Find multiple elements by CSS selector | +| `getClickTarget` | Get element center point for click targeting | +| `getElementAttributes` | Get all attributes of an element | + +## Provenance + +Adapted from `/Users/liuziheng/computer_use/chrome-extension/` with DOM-action methods stripped. diff --git a/services/computer-use-mcp/chrome-extension/background.js b/services/computer-use-mcp/chrome-extension/background.js new file mode 100644 index 0000000000..483a9b824d --- /dev/null +++ b/services/computer-use-mcp/chrome-extension/background.js @@ -0,0 +1,368 @@ +/** + * background.js — MV3 Service Worker for AIRI Desktop Grounding + * + * Routes commands from the AIRI extension bridge → chrome.tabs.sendMessage + * → msg_bridge.js → content.js (__AIRI_DG__) + * + * IMPORTANT: This background does NOT use offscreen documents or Python bridges. + * It receives commands directly from the existing BrowserDomExtensionBridge + * WebSocket connection in the AIRI computer-use-mcp service. + * + * Only read-only observation commands are supported. + * All DOM-mutating actions (click, type, hover, scroll) have been removed + * because the desktop lane uses real macOS OS-level input events. 
+ * + * Adapted from /Users/liuziheng/computer_use/chrome-extension/background.js. + * Stripped: offscreen management, Python bridge, all DOM-action commands + * (clickAt, typeAt, hoverAt, scrollAt, simulateDragDrop, readStorage, + * setStorage, readCanvasData, injectCSS, executeScript, etc.) + */ + +// ---- Bridge connection ---- + +const DEFAULT_BRIDGE_HOST = '127.0.0.1' +const DEFAULT_BRIDGE_PORT = 8765 +const BRIDGE_RECONNECT_DELAY_MS = 1000 +const BRIDGE_HOST_STORAGE_KEY = 'browserDomBridgeHost' +const BRIDGE_PORT_STORAGE_KEY = 'browserDomBridgePort' + +let bridgeSocket = null +let reconnectTimer = null +let connecting = false +let bridgeHost = DEFAULT_BRIDGE_HOST +let bridgePort = DEFAULT_BRIDGE_PORT + +function clearReconnectTimer() { + if (reconnectTimer !== null) { + clearTimeout(reconnectTimer) + reconnectTimer = null + } +} + +function scheduleReconnect(delayMs = BRIDGE_RECONNECT_DELAY_MS) { + if (reconnectTimer !== null) + return + + reconnectTimer = setTimeout(() => { + reconnectTimer = null + connectBridge().catch(() => {}) + }, delayMs) +} + +function sendBridgeMessage(payload) { + if (!bridgeSocket || bridgeSocket.readyState !== WebSocket.OPEN) + return false + + bridgeSocket.send(JSON.stringify(payload)) + return true +} + +function normalizeBridgeHost(value) { + return typeof value === 'string' && value.trim() ? 
value.trim() : DEFAULT_BRIDGE_HOST +} + +function normalizeBridgePort(value) { + if (typeof value === 'number' && Number.isInteger(value) && value > 0) + return value + + if (typeof value === 'string' && value.trim()) { + const parsed = Number.parseInt(value.trim(), 10) + if (Number.isInteger(parsed) && parsed > 0) + return parsed + } + + return DEFAULT_BRIDGE_PORT +} + +async function loadBridgeConfig() { + try { + const stored = await chrome.storage.local.get([ + BRIDGE_HOST_STORAGE_KEY, + BRIDGE_PORT_STORAGE_KEY, + ]) + bridgeHost = normalizeBridgeHost(stored[BRIDGE_HOST_STORAGE_KEY]) + bridgePort = normalizeBridgePort(stored[BRIDGE_PORT_STORAGE_KEY]) + } + catch { + bridgeHost = DEFAULT_BRIDGE_HOST + bridgePort = DEFAULT_BRIDGE_PORT + } +} + +async function saveBridgeConfig(host, port) { + await chrome.storage.local.set({ + [BRIDGE_HOST_STORAGE_KEY]: normalizeBridgeHost(host), + [BRIDGE_PORT_STORAGE_KEY]: normalizeBridgePort(port), + }) + await loadBridgeConfig() +} + +async function handleBridgeMessage(raw) { + let data + try { + data = JSON.parse(String(raw)) + } + catch { + return + } + + const response = await handleCommand(data) + sendBridgeMessage(response) +} + +async function connectBridge() { + if (connecting) + return + if (bridgeSocket && (bridgeSocket.readyState === WebSocket.OPEN || bridgeSocket.readyState === WebSocket.CONNECTING)) + return + + connecting = true + try { + await loadBridgeConfig() + const socket = new WebSocket(`ws://${bridgeHost}:${bridgePort}`) + bridgeSocket = socket + + socket.addEventListener('open', () => { + connecting = false + clearReconnectTimer() + sendBridgeMessage({ + type: 'hello', + source: 'airi-desktop-grounding-extension', + version: chrome.runtime.getManifest().version, + }) + }) + + socket.addEventListener('message', (event) => { + void handleBridgeMessage(event.data) + }) + + socket.addEventListener('close', () => { + if (bridgeSocket === socket) { + bridgeSocket = null + } + connecting = false + 
scheduleReconnect() + }) + + socket.addEventListener('error', () => { + connecting = false + try { + socket.close() + } + catch {} + }) + } + catch { + connecting = false + scheduleReconnect() + } +} + +function reconnectBridgeNow() { + clearReconnectTimer() + if (bridgeSocket) { + try { + bridgeSocket.close() + } + catch {} + bridgeSocket = null + } + connecting = false + void connectBridge() +} + +// ---- Tab / Frame utilities ---- + +async function getActiveTab() { + const tabs = await chrome.tabs.query({ active: true, lastFocusedWindow: true }) + return tabs[0] || null +} + +// ---- Core: route commands to content.js via msg_bridge.js ---- + +/** + * Send a CU_ACTION message to a specific tab + frame. + * msg_bridge.js (ISOLATED world) receives → postMessage → content.js (MAIN world) + */ +async function sendCUAction(tabId, frameId, method, args) { + return new Promise((resolve) => { + const timeout = setTimeout(() => { + resolve({ success: false, error: 'sendMessage timeout' }) + }, 8000) + + try { + chrome.tabs.sendMessage( + tabId, + { type: 'CU_ACTION', method, args: args || [] }, + { frameId }, + (response) => { + clearTimeout(timeout) + if (chrome.runtime.lastError) { + resolve({ success: false, error: chrome.runtime.lastError.message }) + } + else { + resolve(response || { success: false, error: 'no response' }) + } + }, + ) + } + catch (e) { + clearTimeout(timeout) + resolve({ success: false, error: e.message || String(e) }) + } + }) +} + +/** + * Run a CU_ACTION across all frames (or specified frames) in a tab. 
+ * Returns [{frameId, result}] + */ +async function runCUAction(tabId, frameIds, method, args) { + let targets = frameIds + if (!targets || (Array.isArray(targets) && targets.length === 0)) { + const frames = await chrome.webNavigation.getAllFrames({ tabId }) + targets = frames.map(f => f.frameId) + } + else if (!Array.isArray(targets)) { + targets = [targets] + } + + return Promise.all( + targets.map(async (fid) => { + const result = await sendCUAction(tabId, fid, method, args) + return { frameId: fid, result } + }), + ) +} + +// ---- Handle external commands (from AIRI extension bridge) ---- + +/** + * Handle a command from the AIRI BrowserDomExtensionBridge. + * + * Only read-only observation commands are supported: + * - getActiveTab: get the active tab info + * - getAllFrames: list all frames in the active tab + * - readAllFramesDOM: collect interactive elements from all frames + * - findElement: find a single element by CSS selector + * - findElements: find multiple elements by CSS selector + * - getClickTarget: get center point of an element for click targeting + * - getElementAttributes: get all attributes of an element + */ +async function handleCommand(cmd) { + const { action, id } = cmd + try { + let result + const tab = await getActiveTab() + const tabId = cmd.tabId || (tab && tab.id) + + if (!tabId && action !== 'getActiveTab') { + return { id, ok: false, error: 'no active tab' } + } + + switch (action) { + case 'getActiveTab': + result = tab ? 
{ id: tab.id, url: tab.url, title: tab.title } : null + break + + case 'getAllFrames': + result = await chrome.webNavigation.getAllFrames({ tabId }) + break + + case 'readAllFramesDOM': + result = await runCUAction(tabId, cmd.frameIds || null, 'collectFrameDOM', [cmd.opts || {}]) + break + + case 'findElement': + result = await runCUAction(tabId, cmd.frameIds || null, 'findElement', [cmd.selector || '']) + break + + case 'findElements': + result = await runCUAction(tabId, cmd.frameIds || null, 'findElements', [cmd.selector || '', cmd.max || 10]) + break + + case 'getClickTarget': + result = await runCUAction(tabId, cmd.frameIds || null, 'getClickTarget', [cmd.selector || '']) + break + + case 'getElementAttributes': + result = await runCUAction(tabId, cmd.frameIds || null, 'getElementAttributes', [cmd.selector || '']) + break + + default: + return { id, ok: false, error: `unknown action: ${action}` } + } + + return { id, ok: true, result } + } + catch (e) { + return { id, ok: false, error: e.message || String(e) } + } +} + +// ---- Listen for in-extension messages ---- +// NOTE: The AIRI BrowserDomExtensionBridge normally reaches this worker over the WebSocket opened in connectBridge(). +// This listener only handles chrome.runtime.onMessage traffic from within the extension; onMessageExternal is not used. + +chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => { + void connectBridge() + + if (msg.type === 'AIRI_DG_SET_BRIDGE_ENDPOINT') { + saveBridgeConfig(msg.host, msg.port) + .then(() => { + reconnectBridgeNow() + sendResponse({ + ok: true, + host: bridgeHost, + port: bridgePort, + }) + }) + .catch((e) => { + sendResponse({ ok: false, error: e?.message || String(e) }) + }) + return true + } + + if (msg.type === 'AIRI_DG_COMMAND') { + handleCommand(msg.data) + .then(resp => sendResponse(resp)) + .catch(e => sendResponse({ ok: false, error: String(e) })) + return true // Keep sendResponse async + } + + // Support the existing ws-incoming format from BrowserDomExtensionBridge + if (msg.type === 'ws-incoming') { + handleCommand(msg.data) + .then((resp) => 
{ + // Send response back via the same channel + chrome.runtime.sendMessage({ type: 'ws-send', data: resp }) + }) + .catch((e) => { + chrome.runtime.sendMessage({ type: 'ws-send', data: { id: msg.data?.id, ok: false, error: String(e) } }) + }) + return false + } + + return false +}) + +chrome.storage.onChanged.addListener((changes, areaName) => { + if (areaName !== 'local') + return + + if (changes[BRIDGE_HOST_STORAGE_KEY] || changes[BRIDGE_PORT_STORAGE_KEY]) { + void loadBridgeConfig().finally(() => { + reconnectBridgeNow() + }) + } +}) + +chrome.runtime.onStartup.addListener(() => { + void connectBridge() +}) + +chrome.runtime.onInstalled.addListener(() => { + void connectBridge() +}) + +void connectBridge() diff --git a/services/computer-use-mcp/chrome-extension/content.js b/services/computer-use-mcp/chrome-extension/content.js new file mode 100644 index 0000000000..cdd5efc6cf --- /dev/null +++ b/services/computer-use-mcp/chrome-extension/content.js @@ -0,0 +1,228 @@ +/** + * content.js — AIRI Desktop Grounding: read-only DOM observation + * + * Injected into every frame (including cross-origin iframes) in the MAIN world. + * Namespace: window.__AIRI_DG__ + * + * IMPORTANT: This script is READ-ONLY. It does NOT perform any DOM mutations, + * clicks, typing, or navigation. All execution is done via real macOS OS-level + * input events through the desktop grounding executor. + * + * Adapted from /Users/liuziheng/computer_use/chrome-extension/content.js. + * Stripped: clickAt, typeAt, hoverAt, scrollAt, simulateDragDrop, readStorage, + * setStorage, readCanvasData, injectCSS, and all other DOM-mutating methods. + * Kept: collectFrameDOM, _describeElement, _collectInteractiveElements, + * findElement, findElements, getClickTarget. + */ +(function () { + 'use strict' + if (window.__AIRI_DG__) + return // Prevent re-entry + + const MAX_INTERACTIVE = 200 + + // ---- Element description ---- + + /** + * Describe a single DOM element with its tag, attributes, text, rect. 
+ * Returns null only for non-element nodes; invisible elements are still described, with `visible: false`. + */ + function _describeElement(el) { + if (!el || el.nodeType !== 1) + return null + const r = el.getBoundingClientRect() + return { + tag: el.tagName.toLowerCase(), + id: el.id || '', + name: el.name || '', + type: el.type || '', + className: typeof el.className === 'string' ? el.className.slice(0, 120) : '', + text: (el.innerText || el.textContent || '').slice(0, 120).trim(), + value: el.value !== undefined ? String(el.value).slice(0, 60) : '', + href: el.href || '', + placeholder: el.placeholder || '', + role: el.getAttribute('role') || '', + disabled: !!el.disabled, + checked: !!el.checked, + visible: r.width > 0 && r.height > 0, + rect: { x: Math.round(r.left), y: Math.round(r.top), w: Math.round(r.width), h: Math.round(r.height) }, + } + } + + /** + * Collect visible interactive elements from the current frame. + * Targets: links, buttons, inputs, textareas, selects, and elements with + * interactive ARIA roles or tabindex. + */ + function _collectInteractiveElements(maxCount) { + const n = maxCount || MAX_INTERACTIVE + const selectors = 'a,button,input,textarea,select,[role="button"],[role="link"],[role="tab"],[role="menuitem"],[role="checkbox"],[role="radio"],[onclick],[tabindex]' + const nodes = document.querySelectorAll(selectors) + const els = [] + for (let i = 0; i < nodes.length && els.length < n; i++) { + const d = _describeElement(nodes[i]) + if (d && d.visible) + els.push(d) + } + return els + } + + /** + * Get this frame's embedding rect relative to its parent viewport. + * + * NOTICE: Cross-origin frames may not expose `window.frameElement`. + * In that case we return null and let the adapter skip those frame-local + * coordinates rather than projecting them incorrectly onto the desktop. 
+ */ + function _getFrameRect() { + try { + if (window.top === window) + return null + + const frameEl = window.frameElement + if (!(frameEl instanceof Element)) + return null + + const r = frameEl.getBoundingClientRect() + return { + x: Math.round(r.left), + y: Math.round(r.top), + w: Math.round(r.width), + h: Math.round(r.height), + } + } + catch { + return null + } + } + + // ---- Core API (read-only) ---- + + const __AIRI_DG__ = { + version: '1.0-airi-dg', + + /** + * Collect the DOM structure of the current frame. + * Returns URL, title, body text (optional), and interactive elements. + */ + collectFrameDOM(opts) { + opts = opts || {} + const includeText = opts.includeText !== false + const maxElements = opts.maxElements || MAX_INTERACTIVE + return { + url: location.href, + title: document.title || '', + bodyText: includeText ? (document.body ? document.body.innerText || '' : '').slice(0, 3000) : '', + frameRect: _getFrameRect() || undefined, + interactiveElements: _collectInteractiveElements(maxElements), + } + }, + + /** + * Find a single element by CSS selector and describe it. + */ + findElement(selector) { + try { + const el = document.querySelector(selector) + if (!el) + return { success: false, error: 'not found' } + return { success: true, element: _describeElement(el) } + } + catch (e) { + return { success: false, error: e.message } + } + }, + + /** + * Find multiple elements by CSS selector and describe them. + */ + findElements(selector, max) { + try { + const nodes = document.querySelectorAll(selector) + const results = [] + const limit = max || 10 + for (let i = 0; i < nodes.length && results.length < limit; i++) { + const d = _describeElement(nodes[i]) + if (d) + results.push(d) + } + return { success: true, elements: results } + } + catch (e) { + return { success: false, error: e.message } + } + }, + + /** + * Get the center point of an element for click targeting. + * Returns the element description with center coordinates. 
+ */ + getClickTarget(selector) { + try { + const el = document.querySelector(selector) + if (!el) + return { success: false, error: 'not found' } + const r = el.getBoundingClientRect() + return { + success: true, + element: _describeElement(el), + center: { + x: Math.round(r.left + r.width / 2), + y: Math.round(r.top + r.height / 2), + }, + } + } + catch (e) { + return { success: false, error: e.message } + } + }, + + /** + * Get element attributes for debugging. + */ + getElementAttributes(selector) { + try { + const el = document.querySelector(selector) + if (!el) + return { success: false, error: 'not found' } + const attrs = {} + for (const attr of el.attributes) { + attrs[attr.name] = attr.value + } + return { success: true, attributes: attrs } + } + catch (e) { + return { success: false, error: e.message } + } + }, + } + + window.__AIRI_DG__ = __AIRI_DG__ + + // ---- Message handler: ISOLATED world bridge → MAIN world ---- + window.addEventListener('message', (evt) => { + if (evt.source !== window) + return + const data = evt.data + if (!data || data.type !== '__CU_CALL__') + return + + const { reqId, method, args } = data + const fn = __AIRI_DG__[method] + let result + + if (typeof fn === 'function') { + try { + result = { success: true, data: fn.apply(__AIRI_DG__, args || []) } + } + catch (e) { + result = { success: false, error: e.message || String(e) } + } + } + else { + result = { success: false, error: `unknown method: ${method}` } + } + + window.postMessage({ type: '__CU_REPLY__', reqId, result }, '*') + }) +})() diff --git a/services/computer-use-mcp/chrome-extension/icon128.png b/services/computer-use-mcp/chrome-extension/icon128.png new file mode 100644 index 0000000000..fa700c71ed Binary files /dev/null and b/services/computer-use-mcp/chrome-extension/icon128.png differ diff --git a/services/computer-use-mcp/chrome-extension/icon16.png b/services/computer-use-mcp/chrome-extension/icon16.png new file mode 100644 index 0000000000..66abb1f97b Binary 
files /dev/null and b/services/computer-use-mcp/chrome-extension/icon16.png differ diff --git a/services/computer-use-mcp/chrome-extension/icon48.png b/services/computer-use-mcp/chrome-extension/icon48.png new file mode 100644 index 0000000000..b64648fd6a Binary files /dev/null and b/services/computer-use-mcp/chrome-extension/icon48.png differ diff --git a/services/computer-use-mcp/chrome-extension/manifest.json b/services/computer-use-mcp/chrome-extension/manifest.json new file mode 100644 index 0000000000..ec1f6c0977 --- /dev/null +++ b/services/computer-use-mcp/chrome-extension/manifest.json @@ -0,0 +1,43 @@ +{ + "manifest_version": 3, + "name": "AIRI Desktop Grounding Bridge", + "version": "1.0.0", + "description": "Read-only DOM observation bridge for AIRI desktop grounding (no DOM mutation, no eval)", + + "permissions": [ + "activeTab", + "tabs", + "webNavigation", + "storage" + ], + "host_permissions": ["<all_urls>"], + + "background": { + "service_worker": "background.js" + }, + + "content_scripts": [ + { + "matches": ["<all_urls>"], + "js": ["content.js"], + "run_at": "document_idle", + "all_frames": true, + "match_about_blank": true, + "world": "MAIN" + }, + { + "matches": ["<all_urls>"], + "js": ["msg_bridge.js"], + "run_at": "document_idle", + "all_frames": true, + "match_about_blank": true, + "world": "ISOLATED" + } + ], + + "icons": { + "16": "icon16.png", + "48": "icon48.png", + "128": "icon128.png" + } +} diff --git a/services/computer-use-mcp/chrome-extension/msg_bridge.js b/services/computer-use-mcp/chrome-extension/msg_bridge.js new file mode 100644 index 0000000000..f747e3eac9 --- /dev/null +++ b/services/computer-use-mcp/chrome-extension/msg_bridge.js @@ -0,0 +1,79 @@ +/** + * msg_bridge.js — ISOLATED world message bridge + * + * Architecture: + * background.js --chrome.tabs.sendMessage--> msg_bridge.js (ISOLATED) + * | + * window.postMessage + * | + * content.js (MAIN world, __AIRI_DG__) + * | + * window.postMessage (reply) + * | + * background.js <--sendResponse-- 
/**
 * msg_bridge.js — ISOLATED world message bridge
 *
 * Relays calls between the extension background script and the MAIN-world
 * content script:
 *
 *   background.js --chrome.tabs.sendMessage--> msg_bridge.js (ISOLATED)
 *   msg_bridge.js --window.postMessage('__CU_CALL__')--> content.js (MAIN)
 *   content.js    --window.postMessage('__CU_REPLY__')--> msg_bridge.js
 *   msg_bridge.js --sendResponse--> background.js
 *
 * Why this bridge is needed:
 *   - chrome.runtime.onMessage is only delivered to the ISOLATED world
 *   - window.__AIRI_DG__ lives in the MAIN world
 *   - the two worlds can only talk through window.postMessage
 *
 * This file is a pure relay; it never inspects or mutates the DOM itself.
 */
(function () {
  'use strict'

  // Pending requests: reqId → { sendResponse, timer }
  const pending = new Map()
  // Fallback id counter, used only when crypto.randomUUID is unavailable.
  let seqId = 0
  const pageOrigin = window.location.origin
  // Opaque origins serialize as the string 'null' and cannot be used as a
  // postMessage target, so fall back to '*' for those documents.
  const hasUsableOrigin = Boolean(pageOrigin) && pageOrigin !== 'null'
  const postMessageTargetOrigin = hasUsableOrigin ? pageOrigin : '*'

  // Receive commands from background.js
  chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => {
    // Runtime messages may be any JSON value (including null); ignore anything
    // that is not a CU_ACTION envelope instead of throwing on `msg.type`.
    if (!msg || typeof msg !== 'object' || msg.type !== 'CU_ACTION')
      return false

    const reqId = typeof crypto?.randomUUID === 'function'
      ? `__cu_req_${crypto.randomUUID()}`
      : `__cu_req_${++seqId}`
    const { method, args } = msg

    // Answer with a timeout error if the MAIN world never replies. The entry is
    // removed first so a late reply is silently dropped by the reply listener.
    const timer = setTimeout(() => {
      pending.delete(reqId)
      sendResponse({ success: false, error: 'timeout' })
    }, 8000)

    pending.set(reqId, { sendResponse, timer })

    // Forward to MAIN world content.js
    window.postMessage({
      type: '__CU_CALL__',
      reqId,
      method,
      args: args || [],
    }, postMessageTargetOrigin)

    return true // Keep sendResponse usable after this listener returns
  })

  // Receive replies from MAIN world content.js
  window.addEventListener('message', (evt) => {
    // Only accept messages posted by this same window ...
    if (evt.source !== window)
      return
    // ... and, when the page has a real origin, only from that origin.
    if (hasUsableOrigin && evt.origin !== pageOrigin)
      return

    const data = evt.data
    if (!data || data.type !== '__CU_REPLY__')
      return

    const entry = pending.get(data.reqId)
    if (!entry)
      return // Unknown or already timed-out request

    pending.delete(data.reqId)
    clearTimeout(entry.timer)
    entry.sendResponse(data.result)
  })
})()
index 0000000000..7e5cfc8121 --- /dev/null +++ b/services/computer-use-mcp/src/chrome-semantic-adapter.test.ts @@ -0,0 +1,359 @@ +import { describe, expect, it, vi } from 'vitest' + +import { captureChromeSemantics, chromeElementsToTargetCandidates } from './chrome-semantic-adapter' + +// --------------------------------------------------------------------------- +// chromeElementsToTargetCandidates +// --------------------------------------------------------------------------- + +describe('chromeElementsToTargetCandidates', () => { + const windowBounds = { x: 100, y: 50, width: 1200, height: 800 } + + it('transforms page-relative rects to screen-absolute', () => { + const candidates = chromeElementsToTargetCandidates( + [{ + tag: 'button', + text: 'Submit', + rect: { x: 10, y: 20, w: 80, h: 30 }, + }], + windowBounds, + ) + + expect(candidates).toHaveLength(1) + const c = candidates[0] + // x = windowBounds.x + rect.x = 100 + 10 = 110 + // y = windowBounds.y + chromeHeight(88) + rect.y = 50 + 88 + 20 = 158 + expect(c.bounds.x).toBe(110) + expect(c.bounds.y).toBe(158) + expect(c.bounds.width).toBe(80) + expect(c.bounds.height).toBe(30) + }) + + it('allows custom chrome height', () => { + const candidates = chromeElementsToTargetCandidates( + [{ tag: 'button', text: 'A', rect: { x: 0, y: 0, w: 50, h: 20 } }], + windowBounds, + 100, // custom chrome height + ) + expect(candidates[0].bounds.y).toBe(50 + 100 + 0) + }) + + it('skips elements with zero-size rects', () => { + const candidates = chromeElementsToTargetCandidates( + [ + { tag: 'button', text: 'Zero', rect: { x: 0, y: 0, w: 0, h: 0 } }, + { tag: 'button', text: 'Valid', rect: { x: 10, y: 10, w: 50, h: 20 } }, + ], + windowBounds, + ) + expect(candidates).toHaveLength(1) + expect(candidates[0].label).toBe('Valid') + }) + + it('skips elements without rects', () => { + const candidates = chromeElementsToTargetCandidates( + [{ tag: 'button', text: 'No rect' }], + windowBounds, + ) + 
expect(candidates).toHaveLength(0) + }) + + it('skips elements outside window bounds', () => { + const candidates = chromeElementsToTargetCandidates( + [ + // Element far below the window + { tag: 'button', text: 'Below', rect: { x: 10, y: 2000, w: 50, h: 20 } }, + { tag: 'button', text: 'Inside', rect: { x: 10, y: 10, w: 50, h: 20 } }, + ], + windowBounds, + ) + expect(candidates).toHaveLength(1) + expect(candidates[0].label).toBe('Inside') + }) + + it('sets source to chrome_dom', () => { + const candidates = chromeElementsToTargetCandidates( + [{ tag: 'a', text: 'Link', rect: { x: 0, y: 0, w: 40, h: 16 } }], + windowBounds, + ) + expect(candidates[0].source).toBe('chrome_dom') + }) + + it('buttons get high confidence', () => { + const candidates = chromeElementsToTargetCandidates( + [{ tag: 'button', text: 'Go', rect: { x: 0, y: 0, w: 50, h: 20 } }], + windowBounds, + ) + expect(candidates[0].confidence).toBe(0.95) + }) + + it('disabled elements get low confidence', () => { + const candidates = chromeElementsToTargetCandidates( + [{ tag: 'button', text: 'Disabled', rect: { x: 0, y: 0, w: 50, h: 20 }, disabled: true }], + windowBounds, + ) + expect(candidates[0].confidence).toBe(0.3) + expect(candidates[0].interactable).toBe(false) + }) + + it('builds label from text, placeholder, name, id, href', () => { + const textLabel = chromeElementsToTargetCandidates( + [{ tag: 'button', text: 'Click me', rect: { x: 0, y: 0, w: 50, h: 20 } }], + windowBounds, + ) + expect(textLabel[0].label).toBe('Click me') + + const placeholderLabel = chromeElementsToTargetCandidates( + [{ tag: 'input', placeholder: 'Enter name', rect: { x: 0, y: 0, w: 50, h: 20 } }], + windowBounds, + ) + expect(placeholderLabel[0].label).toBe('[Enter name]') + + const idLabel = chromeElementsToTargetCandidates( + [{ tag: 'div', id: 'main-cta', rect: { x: 0, y: 0, w: 50, h: 20 } }], + windowBounds, + ) + expect(idLabel[0].label).toBe('#main-cta') + }) +}) + +// 
--------------------------------------------------------------------------- +// captureChromeSemantics +// --------------------------------------------------------------------------- + +describe('captureChromeSemantics', () => { + it('returns null when both bridges are undefined', async () => { + const result = await captureChromeSemantics(undefined, undefined) + expect(result).toBeNull() + }) + + it('uses extension bridge when connected', async () => { + const mockExtension = { + getStatus: () => ({ connected: true, enabled: true, host: 'localhost', port: 8080, pendingRequests: 0 }), + readAllFramesDom: vi.fn().mockResolvedValue([ + { + frameId: 0, + result: { + url: 'https://example.com', + title: 'Example', + interactiveElements: [ + { tag: 'button', text: 'Click', rect: { x: 0, y: 0, w: 50, h: 20 } }, + ], + }, + }, + ]), + } + + const result = await captureChromeSemantics(mockExtension as any, undefined) + expect(result).not.toBeNull() + expect(result!.source).toBe('extension') + expect(result!.pageUrl).toBe('https://example.com') + expect(result!.interactiveElements).toHaveLength(1) + }) + + it('falls back to CDP when extension capture returns no interactive elements', async () => { + const mockExtension = { + getStatus: () => ({ connected: true, enabled: true, host: 'localhost', port: 8080, pendingRequests: 0 }), + readAllFramesDom: vi.fn().mockResolvedValue([ + { + frameId: 0, + result: { + url: 'https://example.com', + title: 'Example', + interactiveElements: [], + }, + }, + ]), + } + const mockCdp = { + getStatus: vi.fn().mockReturnValue({ + connected: true, + pageUrl: 'https://example.com', + pageTitle: 'Example', + }), + collectInteractiveElements: vi.fn().mockResolvedValue([ + { tag: 'button', text: 'Fallback CTA', rect: { x: 0, y: 0, w: 50, h: 20 } }, + ]), + } + + const result = await captureChromeSemantics(mockExtension as any, mockCdp as any) + expect(result).not.toBeNull() + expect(result!.source).toBe('cdp') + 
expect(result!.interactiveElements).toHaveLength(1) + expect(result!.interactiveElements[0].text).toBe('Fallback CTA') + }) + + it('unwraps extension frame payloads nested under result.data', async () => { + const mockExtension = { + getStatus: () => ({ connected: true, enabled: true, host: 'localhost', port: 8080, pendingRequests: 0 }), + readAllFramesDom: vi.fn().mockResolvedValue([ + { + frameId: 0, + result: { + data: { + url: 'https://nested.example.com', + title: 'Nested Example', + interactiveElements: [ + { tag: 'button', text: 'Nested click', rect: { x: 0, y: 0, w: 50, h: 20 } }, + ], + }, + }, + }, + ]), + } + + const result = await captureChromeSemantics(mockExtension as any, undefined) + expect(result).not.toBeNull() + expect(result!.pageUrl).toBe('https://nested.example.com') + expect(result!.pageTitle).toBe('Nested Example') + expect(result!.interactiveElements).toHaveLength(1) + }) + + it('applies iframe offsets before returning extension frame elements', async () => { + const mockExtension = { + getStatus: () => ({ connected: true, enabled: true, host: 'localhost', port: 8080, pendingRequests: 0 }), + getAllFrames: vi.fn().mockResolvedValue([ + { frameId: 0, parentFrameId: -1 }, + { frameId: 7, parentFrameId: 0 }, + ]), + readAllFramesDom: vi.fn().mockResolvedValue([ + { + frameId: 0, + result: { + url: 'https://example.com', + title: 'Example', + interactiveElements: [], + }, + }, + { + frameId: 7, + result: { + frameRect: { x: 120, y: 80, w: 640, h: 480 }, + interactiveElements: [ + { tag: 'button', text: 'Iframe CTA', rect: { x: 10, y: 20, w: 50, h: 20 } }, + ], + }, + }, + ]), + } + + const result = await captureChromeSemantics(mockExtension as any, undefined) + expect(result).not.toBeNull() + expect(result!.interactiveElements).toHaveLength(1) + expect(result!.interactiveElements[0].rect).toEqual({ + x: 130, + y: 100, + w: 50, + h: 20, + }) + }) + + it('skips subframe elements when iframe offsets are unavailable', async () => { + const 
mockExtension = { + getStatus: () => ({ connected: true, enabled: true, host: 'localhost', port: 8080, pendingRequests: 0 }), + getAllFrames: vi.fn().mockResolvedValue([ + { frameId: 0, parentFrameId: -1 }, + { frameId: 9, parentFrameId: 0 }, + ]), + readAllFramesDom: vi.fn().mockResolvedValue([ + { + frameId: 0, + result: { + url: 'https://example.com', + title: 'Example', + interactiveElements: [ + { tag: 'button', text: 'Root CTA', rect: { x: 0, y: 0, w: 20, h: 20 } }, + ], + }, + }, + { + frameId: 9, + result: { + interactiveElements: [ + { tag: 'button', text: 'Iframe CTA', rect: { x: 10, y: 20, w: 50, h: 20 } }, + ], + }, + }, + ]), + } + + const result = await captureChromeSemantics(mockExtension as any, undefined) + expect(result).not.toBeNull() + expect(result!.interactiveElements).toHaveLength(1) + expect(result!.interactiveElements[0].text).toBe('Root CTA') + }) + + it('resolves nested iframe offsets even when frame results arrive out of order', async () => { + const mockExtension = { + getStatus: () => ({ connected: true, enabled: true, host: 'localhost', port: 8080, pendingRequests: 0 }), + getAllFrames: vi.fn().mockResolvedValue([ + { frameId: 0, parentFrameId: -1 }, + { frameId: 7, parentFrameId: 0 }, + { frameId: 12, parentFrameId: 7 }, + ]), + readAllFramesDom: vi.fn().mockResolvedValue([ + { + frameId: 12, + result: { + frameRect: { x: 15, y: 25, w: 320, h: 200 }, + interactiveElements: [ + { tag: 'button', text: 'Nested CTA', rect: { x: 3, y: 4, w: 40, h: 20 } }, + ], + }, + }, + { + frameId: 0, + result: { + url: 'https://example.com', + title: 'Example', + interactiveElements: [], + }, + }, + { + frameId: 7, + result: { + frameRect: { x: 120, y: 80, w: 640, h: 480 }, + interactiveElements: [], + }, + }, + ]), + } + + const result = await captureChromeSemantics(mockExtension as any, undefined) + expect(result).not.toBeNull() + expect(result!.interactiveElements).toHaveLength(1) + expect(result!.interactiveElements[0].rect).toEqual({ + x: 138, + 
y: 109, + w: 40, + h: 20, + }) + }) + + it('falls back to CDP when extension is disconnected', async () => { + const mockExtension = { + getStatus: () => ({ connected: false, enabled: true, host: 'localhost', port: 8080, pendingRequests: 0 }), + } + + const mockCdp = { + getStatus: () => ({ connected: true, cdpUrl: 'http://localhost:9222', pageUrl: 'https://cdp.com', pageTitle: 'CDP' }), + collectInteractiveElements: vi.fn().mockResolvedValue([ + { tag: 'input', text: '', rect: { x: 0, y: 0, w: 100, h: 20 } }, + ]), + } + + const result = await captureChromeSemantics(mockExtension as any, mockCdp as any) + expect(result).not.toBeNull() + expect(result!.source).toBe('cdp') + expect(result!.pageUrl).toBe('https://cdp.com') + }) + + it('returns null when extension throws and CDP unavailable', async () => { + const mockExtension = { + getStatus: () => { throw new Error('boom') }, + } + + const result = await captureChromeSemantics(mockExtension as any, undefined) + expect(result).toBeNull() + }) +}) diff --git a/services/computer-use-mcp/src/chrome-semantic-adapter.ts b/services/computer-use-mcp/src/chrome-semantic-adapter.ts new file mode 100644 index 0000000000..de929b1200 --- /dev/null +++ b/services/computer-use-mcp/src/chrome-semantic-adapter.ts @@ -0,0 +1,425 @@ +/** + * Chrome semantic adapter — collects interactive element data from Chrome + * and maps it to DesktopTargetCandidate format. + * + * Uses the extension bridge as primary source and CDP bridge as fallback. + * Only active when Chrome is the foreground app. + * + * The adapter handles coordinate transformation from page-relative + * (CSS viewport) coordinates to screen-absolute coordinates using + * the Chrome window bounds from the window observation. 
+ */ + +import type { CdpBridge } from './browser-dom/cdp-bridge' +import type { BrowserDomExtensionBridge } from './browser-dom/extension-bridge' +import type { + ChromeSemanticSnapshot, + DesktopTargetCandidate, +} from './desktop-grounding-types' +import type { + Bounds, + BrowserDomFrameDom, + BrowserDomInteractiveElement, +} from './types' + +/** + * Estimated height of Chrome's browser chrome (tab bar + address bar + bookmarks bar) + * in logical pixels on macOS. + * + * NOTICE: This is a heuristic. The actual value depends on Chrome's zoom level, + * whether the bookmarks bar is shown, and whether the tab strip is compact. + * A more accurate approach would be to probe via the extension bridge, but + * that adds an extra roundtrip. For v1 this constant is sufficient. + */ +const CHROME_CHROME_HEIGHT_PX = 88 + +/** + * Capture Chrome semantic data from the active tab. + * + * Tries the extension bridge first (richer data, no `--remote-debugging-port` needed). + * Falls back to CDP bridge if the extension is unavailable. + * Returns `null` if both fail (graceful degradation). 
/**
 * Capture Chrome semantic data from the active tab.
 *
 * Resolution order:
 * 1. Extension bridge, when it reports `connected` and the capture contains at
 *    least one interactive element.
 * 2. CDP bridge, when it reports `connected`.
 * 3. The element-less extension capture (it still carries page URL and title),
 *    when one was taken but the CDP bridge is absent, disconnected, or failed.
 *
 * Bridge errors are swallowed deliberately so that observation degrades
 * gracefully instead of failing the caller.
 *
 * @param extensionBridge - The WebSocket extension bridge (may be undefined or disconnected)
 * @param cdpBridge - The CDP bridge (may be undefined or disconnected)
 * @returns The captured snapshot, or `null` when neither bridge produced one
 */
export async function captureChromeSemantics(
  extensionBridge: BrowserDomExtensionBridge | undefined,
  cdpBridge: CdpBridge | undefined,
): Promise<ChromeSemanticSnapshot | null> {
  // Remembered so an empty-but-valid extension capture is not lost when the
  // CDP fallback turns out to be unusable.
  let extensionSnapshot: ChromeSemanticSnapshot | null = null

  // Try extension bridge first
  if (extensionBridge) {
    try {
      if (extensionBridge.getStatus().connected) {
        extensionSnapshot = await captureViaExtension(extensionBridge)
        if (extensionSnapshot.interactiveElements.length > 0)
          return extensionSnapshot
      }
    }
    catch {
      // Fall through to CDP
    }
  }

  // Fallback to CDP bridge
  if (cdpBridge) {
    try {
      if (cdpBridge.getStatus().connected)
        return await captureViaCdp(cdpBridge)
    }
    catch {
      // Fall through to whatever the extension produced (possibly nothing)
    }
  }

  return extensionSnapshot
}
/**
 * Convert Chrome interactive elements to desktop target candidates.
 *
 * Element rects are offset by the window origin plus `chromeHeightPx` to
 * obtain screen-absolute bounds. Elements are dropped when they:
 * - have no rect, or a rect with non-finite position or non-positive size
 * - end above or left of the page viewport (e.g. scrolled out of view; their
 *   computed screen position would land on the browser toolbar)
 * - start beyond the right or bottom edge of the window
 *
 * @param elements - Interactive elements from the Chrome page
 * @param windowBounds - Screen-absolute bounds of the Chrome window
 * @param chromeHeightPx - Height of the browser chrome in logical pixels (default: `CHROME_CHROME_HEIGHT_PX`)
 * @returns Array of desktop target candidates with `source: 'chrome_dom'`;
 *          `id` is left empty for the grounding layer to assign
 */
export function chromeElementsToTargetCandidates(
  elements: BrowserDomInteractiveElement[],
  windowBounds: Bounds,
  chromeHeightPx: number = CHROME_CHROME_HEIGHT_PX,
): DesktopTargetCandidate[] {
  const candidates: DesktopTargetCandidate[] = []

  // Screen position of the page viewport's top-left corner.
  const viewportLeft = windowBounds.x
  const viewportTop = windowBounds.y + chromeHeightPx
  const windowRight = windowBounds.x + windowBounds.width
  const windowBottom = windowBounds.y + windowBounds.height

  for (const el of elements) {
    const rect = el.rect
    if (!rect || !Number.isFinite(rect.x) || !Number.isFinite(rect.y))
      continue
    // `!(n > 0)` rejects zero, negative and NaN sizes in one comparison.
    if (!(rect.w > 0) || !(rect.h > 0))
      continue

    // Convert page-relative rect to screen-absolute bounds
    const bounds: Bounds = {
      x: viewportLeft + rect.x,
      y: viewportTop + rect.y,
      width: rect.w,
      height: rect.h,
    }

    // Entirely left of / above the viewport: not visible in the page area.
    if (bounds.x + bounds.width < viewportLeft || bounds.y + bounds.height < viewportTop)
      continue
    // Entirely right of / below the window.
    if (bounds.x > windowRight || bounds.y > windowBottom)
      continue

    candidates.push({
      id: '', // Will be assigned by the grounding layer
      source: 'chrome_dom',
      appName: 'Google Chrome',
      role: el.role || el.tag || 'element',
      label: buildLabel(el),
      bounds,
      confidence: computeElementConfidence(el),
      interactable: !el.disabled,
      tag: el.tag,
      href: el.href,
      inputType: el.type,
    })
  }

  return candidates
}
/** Narrow an unknown value to a plain (non-array) object record, or `undefined`. */
function toRecord(value: unknown): Record<string, unknown> | undefined {
  if (!value || typeof value !== 'object' || Array.isArray(value))
    return undefined

  return value as Record<string, unknown>
}

/** Return `value` when it is a finite number, otherwise `undefined`. */
function toFiniteNumber(value: unknown): number | undefined {
  return typeof value === 'number' && Number.isFinite(value) ? value : undefined
}

/** Frame results may wrap the DOM payload under `data`; unwrap it when present. */
function getExtensionFramePayload(result: Record<string, unknown>) {
  return toRecord(result.data) ?? result
}

/** Read a fully numeric `frameRect` (`x`, `y`, `w`, `h`) from a frame payload. */
function getFrameRect(payload: Record<string, unknown>): BrowserDomFrameDom['frameRect'] | undefined {
  const rect = toRecord(payload.frameRect)
  if (!rect)
    return undefined

  const x = toFiniteNumber(rect.x)
  const y = toFiniteNumber(rect.y)
  const w = toFiniteNumber(rect.w)
  const h = toFiniteNumber(rect.h)
  if (x === undefined || y === undefined || w === undefined || h === undefined)
    return undefined

  return { x, y, w, h }
}

/** Read the numeric `parentFrameId` from a frame-tree entry. */
function getFrameParentId(frame: Record<string, unknown>): number | undefined {
  return toFiniteNumber(frame.parentFrameId)
}

/** Return a copy of `element` whose `rect` and `center` are shifted by `offset`. */
function offsetInteractiveElement(
  element: BrowserDomInteractiveElement,
  offset: { x: number, y: number },
): BrowserDomInteractiveElement {
  return {
    ...element,
    rect: element.rect
      ? {
          ...element.rect,
          x: element.rect.x + offset.x,
          y: element.rect.y + offset.y,
        }
      : element.rect,
    center: element.center
      ? {
          x: element.center.x + offset.x,
          y: element.center.y + offset.y,
        }
      : element.center,
  }
}

/**
 * Resolve the accumulated offset of a frame by summing the `frameRect` origins
 * along its parent chain up to frame 0 (treated as the root, offset 0/0).
 *
 * NOTE(review): this presumes each `frameRect` is expressed relative to its
 * parent frame's viewport — confirm against the content script.
 *
 * @param frameId - Frame whose offset is wanted
 * @param parentIds - frameId → parentFrameId, from the frame tree
 * @param payloads - frameId → unwrapped DOM payload
 * @param cache - Memo of successfully resolved offsets (mutated)
 * @param visiting - Frames on the current resolution path, for cycle detection
 * @returns The offset, or `null` when a `frameRect`/parent link is missing or
 *          the parent chain is cyclic
 */
function resolveFrameOffset(
  frameId: number,
  parentIds: Map<number, number | undefined>,
  payloads: Map<number, Record<string, unknown>>,
  cache: Map<number, { x: number, y: number }>,
  visiting: Set<number> = new Set(),
): { x: number, y: number } | null {
  if (cache.has(frameId))
    return cache.get(frameId) ?? null

  if (frameId === 0) {
    const rootOffset = { x: 0, y: 0 }
    cache.set(frameId, rootOffset)
    return rootOffset
  }

  // Already on the current path → the parent chain loops back on itself.
  if (visiting.has(frameId)) {
    return null
  }

  visiting.add(frameId)

  const payload = payloads.get(frameId)
  const frameRect = payload ? getFrameRect(payload) : undefined
  const parentFrameId = parentIds.get(frameId)
  if (!frameRect || parentFrameId === undefined) {
    visiting.delete(frameId)
    return null
  }

  const parentOffset = resolveFrameOffset(parentFrameId, parentIds, payloads, cache, visiting)
  if (!parentOffset) {
    visiting.delete(frameId)
    return null
  }

  const resolvedOffset = {
    x: parentOffset.x + frameRect.x,
    y: parentOffset.y + frameRect.y,
  }
  cache.set(frameId, resolvedOffset)
  visiting.delete(frameId)
  return resolvedOffset
}

/**
 * Capture a snapshot through the extension bridge.
 *
 * Reads the DOM of every frame, takes page URL/title from frame 0, and merges
 * the interactive elements of all frames after shifting subframe elements by
 * their resolved frame offset. Subframes whose offset cannot be resolved are
 * skipped, as are frames whose payload is not an object or whose
 * `interactiveElements` is not an array.
 */
async function captureViaExtension(
  bridge: BrowserDomExtensionBridge,
): Promise<ChromeSemanticSnapshot> {
  const frames = await bridge.readAllFramesDom({
    includeText: false,
    maxElements: 150,
  })
  // The frame tree is optional: without it only the root frame can be placed.
  const frameTree = typeof bridge.getAllFrames === 'function'
    ? await bridge.getAllFrames().catch(() => [])
    : []

  // Merge interactive elements from all frames
  const allElements: BrowserDomInteractiveElement[] = []
  let pageUrl = ''
  let pageTitle = ''
  const payloadsByFrameId = new Map<number, Record<string, unknown>>()
  const parentIdsByFrameId = new Map<number, number | undefined>()
  const resolvedOffsets = new Map<number, { x: number, y: number }>()

  for (const frame of frameTree) {
    const frameRecord = toRecord(frame)
    if (!frameRecord)
      continue

    const frameId = toFiniteNumber(frameRecord.frameId)
    if (frameId === undefined)
      continue

    parentIdsByFrameId.set(frameId, getFrameParentId(frameRecord))
  }

  // First pass: index every payload so offsets resolve regardless of the
  // order in which frame results arrived.
  for (const frame of frames) {
    const dom = toRecord(frame.result)
    if (!dom)
      continue

    const payload = getExtensionFramePayload(dom)
    payloadsByFrameId.set(frame.frameId, payload)

    if (frame.frameId === 0) {
      pageUrl = typeof payload.url === 'string' ? payload.url : ''
      pageTitle = typeof payload.title === 'string' ? payload.title : ''
    }
  }

  // Second pass: collect elements, shifted into root-frame coordinates.
  for (const frame of frames) {
    const payload = payloadsByFrameId.get(frame.frameId)
    if (!payload)
      continue

    const rawElements = payload.interactiveElements
    // A malformed payload must not abort the capture of the other frames.
    if (!Array.isArray(rawElements))
      continue
    const elements = rawElements as BrowserDomInteractiveElement[]

    const offset = resolveFrameOffset(
      frame.frameId,
      parentIdsByFrameId,
      payloadsByFrameId,
      resolvedOffsets,
    )

    if (frame.frameId !== 0 && !offset) {
      continue
    }

    const normalizedElements = offset
      ? elements.map(element => offsetInteractiveElement(element, offset))
      : elements
    allElements.push(...normalizedElements)
  }

  return {
    pageUrl,
    pageTitle,
    interactiveElements: allElements,
    capturedAt: new Date().toISOString(),
    source: 'extension',
  }
}

/**
 * Capture a snapshot through the CDP bridge.
 *
 * Collects up to 150 interactive elements and copies the known fields of each
 * into a `BrowserDomInteractiveElement`; page URL/title come from the bridge
 * status.
 */
async function captureViaCdp(bridge: CdpBridge): Promise<ChromeSemanticSnapshot> {
  const elements = await bridge.collectInteractiveElements(150)

  const status = bridge.getStatus()

  // Map CDP elements to our BrowserDomInteractiveElement format
  const mapped: BrowserDomInteractiveElement[] = (elements || []).map((el: Record<string, unknown>) => ({
    tag: el.tag as string | undefined,
    id: el.id as string | undefined,
    name: el.name as string | undefined,
    type: el.type as string | undefined,
    text: el.text as string | undefined,
    value: el.value as string | undefined,
    href: el.href as string | undefined,
    placeholder: el.placeholder as string | undefined,
    disabled: el.disabled as boolean | undefined,
    checked: el.checked as boolean | undefined,
    role: el.role as string | undefined,
    rect: el.rect as { x: number, y: number, w: number, h: number } | undefined,
    center: el.center as { x: number, y: number } | undefined,
  }))

  return {
    pageUrl: status.pageUrl || '',
    pageTitle: status.pageTitle || '',
    interactiveElements: mapped,
    capturedAt: new Date().toISOString(),
    source: 'cdp',
  }
}
/**
 * Derive a short human-readable label for an element.
 *
 * The first available attribute wins, in this order:
 * trimmed text (max 80 chars) → `[placeholder]` (max 60 chars) →
 * `name="…"` → `#id` → href (cut to 57 chars plus `...` when longer than 60) →
 * tag → the literal `element`.
 */
function buildLabel(el: BrowserDomInteractiveElement): string {
  const trimmedText = el.text ? el.text.trim() : ''
  if (trimmedText)
    return trimmedText.slice(0, 80)

  const trimmedPlaceholder = el.placeholder ? el.placeholder.trim() : ''
  if (trimmedPlaceholder)
    return `[${trimmedPlaceholder.slice(0, 60)}]`

  if (el.name)
    return `name="${el.name}"`

  if (el.id)
    return `#${el.id}`

  if (el.href) {
    // Keep long URLs readable
    return el.href.length > 60 ? `${el.href.slice(0, 57)}...` : el.href
  }

  return el.tag || 'element'
}

/**
 * Score how confidently a Chrome DOM element can be treated as a click target.
 *
 * - disabled                                           → 0.3
 * - `button` / `a` tag, or button/link/tab/menuitem role → 0.95
 * - `input` / `textarea` / `select` tag                → 0.9
 * - checkbox / radio role                              → 0.85
 * - anything else                                      → 0.7
 *
 * Tag and role are compared case-insensitively; the first matching rule wins.
 */
function computeElementConfidence(el: BrowserDomInteractiveElement): number {
  // Disabled elements rank lowest regardless of tag or role.
  if (el.disabled)
    return 0.3

  const tagName = el.tag?.toLowerCase() || ''
  const ariaRole = el.role?.toLowerCase() || ''

  const isActionTag = ['button', 'a'].includes(tagName)
  const isActionRole = ['button', 'link', 'tab', 'menuitem'].includes(ariaRole)
  if (isActionTag || isActionRole)
    return 0.95

  // Native form controls
  if (['input', 'textarea', 'select'].includes(tagName))
    return 0.9

  // Toggle-like ARIA roles on non-native elements
  if (['checkbox', 'radio'].includes(ariaRole))
    return 0.85

  // Default
  return 0.7
}
+ * + * These types power the `desktop_observe` and `desktop_click_target` tools, + * merging screenshot, AX tree, window observation, and Chrome semantic data + * into a single grounding snapshot with ranked target candidates. + */ + +import type { AXSnapshot } from './accessibility/types' +import type { + Bounds, + BrowserDomInteractiveElement, + PointerTracePoint, + ScreenshotArtifact, + WindowInfo, +} from './types' + +// Re-export input types from types.ts (canonical definitions live there to avoid circular deps) +export type { DesktopClickTargetInput, DesktopObserveInput } from './types' + +// --------------------------------------------------------------------------- +// Target candidate source hierarchy (higher = preferred for snap) +// --------------------------------------------------------------------------- + +/** Which observation source produced a target candidate. */ +export type TargetSource = 'chrome_dom' | 'ax' | 'vision' | 'raw' + +/** + * Priority order for snap resolution. + * Lower index = higher priority. + */ +export const TARGET_SOURCE_PRIORITY: readonly TargetSource[] = [ + 'chrome_dom', + 'ax', + 'vision', + 'raw', +] as const + +/** Maximum snapshot age tolerated before `desktop_click_target` must refresh. */ +export const DESKTOP_CLICK_SNAPSHOT_MAX_AGE_MS = 5_000 + +// --------------------------------------------------------------------------- +// Target candidate +// --------------------------------------------------------------------------- + +/** + * A single interactable UI element discovered by the grounding layer. + * + * Candidates come from different sources (Chrome DOM, macOS AX tree, vision) + * and are merged into a unified list with deduplication. + */ +export interface DesktopTargetCandidate { + /** Stable id within the snapshot (e.g. 
"t_0", "t_1") */ + id: string + /** Which observation source produced this candidate */ + source: TargetSource + /** Application name */ + appName: string + /** Window identifier from the window observation */ + windowId?: string + /** Semantic role (e.g. "AXButton", "button", "input") */ + role: string + /** Human-readable label (title, text content, placeholder) */ + label: string + /** Screen-absolute bounding rect in logical pixels */ + bounds: Bounds + /** Confidence that this candidate is correctly identified (0-1) */ + confidence: number + /** Whether the element appears interactable (clickable, focusable) */ + interactable: boolean + + // ---- Chrome DOM extras ---- + /** HTML tag name (e.g. "a", "button", "input") */ + tag?: string + /** href for links */ + href?: string + /** Input type (e.g. "text", "password", "email") */ + inputType?: string + /** CSS selector for re-querying (best-effort) */ + selector?: string + + // ---- AX extras ---- + /** AX tree UID for `findAXNodeByUid` lookup */ + axUid?: string + /** Whether the element has keyboard focus */ + focused?: boolean + /** Whether the element is enabled */ + enabled?: boolean +} + +// --------------------------------------------------------------------------- +// Chrome semantic snapshot +// --------------------------------------------------------------------------- + +/** + * Semantic data from Chrome's active page, collected via + * the Chrome extension bridge or CDP bridge. 
+ */ +export interface ChromeSemanticSnapshot { + /** Current page URL */ + pageUrl: string + /** Current page title */ + pageTitle: string + /** Interactive elements collected from the page DOM */ + interactiveElements: BrowserDomInteractiveElement[] + /** ISO timestamp when the snapshot was captured */ + capturedAt: string + /** Which bridge produced the data */ + source: 'extension' | 'cdp' +} + +// --------------------------------------------------------------------------- +// Desktop grounding snapshot (the unified observation output) +// --------------------------------------------------------------------------- + +/** + * Staleness flags for each observation source. + * `true` means the data is stale or unavailable. + */ +export interface GroundingStalenessFlags { + /** Screenshot is stale or missing */ + screenshot: boolean + /** AX tree is stale or unavailable */ + ax: boolean + /** Chrome semantic data is stale or unavailable (always true for non-Chrome apps) */ + chromeSemantic: boolean +} + +/** + * Unified output of `desktop_observe`. + * + * Merges all desktop observation sources into a single structure + * with ranked, deduplicated target candidates. 
+ */ +export interface DesktopGroundingSnapshot { + /** Unique identifier for this snapshot */ + snapshotId: string + /** ISO timestamp when the snapshot was assembled */ + capturedAt: string + /** Name of the foreground application */ + foregroundApp: string + /** Title of the foreground window when available */ + foregroundWindowTitle?: string + /** Current window list */ + windows: WindowInfo[] + /** Latest screenshot artifact */ + screenshot: ScreenshotArtifact + /** macOS AX tree snapshot (if captured successfully) */ + axSnapshot?: AXSnapshot + /** Chrome semantic snapshot (only when Chrome is foreground) */ + chromeSemanticSnapshot?: ChromeSemanticSnapshot + /** Merged, deduplicated, ranked target candidates */ + targetCandidates: DesktopTargetCandidate[] + /** Which sources are stale or unavailable */ + staleFlags: GroundingStalenessFlags +} + +// --------------------------------------------------------------------------- +// Snap resolution +// --------------------------------------------------------------------------- + +/** + * Result of resolving a raw coordinate to a snapped target candidate. + * + * Records the full snap decision for tracing and debugging. + */ +export interface SnapResolution { + /** Original point requested by the agent */ + rawPoint: { x: number, y: number } + /** Final point after snap resolution (center of matched candidate, or rawPoint fallback) */ + snappedPoint: { x: number, y: number } + /** Matched candidate id (undefined if no match → raw fallback) */ + candidateId?: string + /** Which source tier produced the match */ + source: TargetSource | 'none' + /** Human-readable explanation of the snap decision */ + reason: string +} + +// --------------------------------------------------------------------------- +// Pointer intent +// --------------------------------------------------------------------------- + +/** + * Describes the agent's intention to interact with a desktop target. 
/**
 * Describes the agent's intention to interact with a desktop target.
 *
 * Generated before each click for UI overlay visualization and trace logging.
 */
export interface PointerIntent {
  /** 'preview' = for overlay visualization only, 'execute' = real click pending */
  mode: 'preview' | 'execute'
  /** Target candidate id (if snapped to a candidate) */
  candidateId?: string
  /** Original raw coordinate */
  rawPoint: { x: number, y: number }
  /** Snapped coordinate (after resolution) */
  snappedPoint: { x: number, y: number }
  /** Source tier of the matched candidate */
  source: TargetSource | 'none'
  /**
   * Confidence of the snap decision.
   * NOTE(review): the range is not stated here; candidates elsewhere use values such as 0.8 / 0.95, so presumably 0–1 — confirm.
   */
  confidence: number
  /** Pointer animation path for overlay visualization */
  path: PointerTracePoint[]
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Build an AX snapshot fixture: one `AXApplication` root whose children are
 * the given partial nodes, each completed with defaults (uid `node_<i>`,
 * role `AXButton`, title `Button <i>`, a 50x30 box laid out left to right).
 */
function makeAXSnapshot(nodes: Partial<AXNode>[]): AXSnapshot {
  const root: AXNode = {
    uid: 'root_0',
    role: 'AXApplication',
    children: nodes.map((n, i) => ({
      uid: n.uid ?? `node_${i}`,
      role: n.role ?? 'AXButton',
      title: n.title ?? `Button ${i}`,
      bounds: n.bounds ?? { x: 100 + i * 60, y: 100, width: 50, height: 30 },
      enabled: n.enabled ?? true,
      focused: n.focused ?? false,
      children: n.children ?? [],
    })),
  }

  // Index every node (root included) by uid, mirroring the snapshot's lookup table.
  const uidToNode = new Map<string, AXNode>()
  function walk(node: AXNode) {
    uidToNode.set(node.uid, node)
    for (const child of node.children) walk(child)
  }
  walk(root)

  return {
    snapshotId: 'ax_1',
    pid: 1234,
    appName: 'Google Chrome',
    root,
    uidToNode,
    capturedAt: new Date().toISOString(),
    maxDepth: 15,
    truncated: false,
  }
}

/**
 * Build a Chrome semantic snapshot fixture from loosely specified elements.
 * Missing fields default to a `button` labelled "Click me" at (50,50) 100x30.
 */
function makeChromeSnapshot(elements: Array<{
  tag?: string
  text?: string
  role?: string
  rect?: { x: number, y: number, w: number, h: number }
  disabled?: boolean
}>): ChromeSemanticSnapshot {
  return {
    pageUrl: 'https://example.com',
    pageTitle: 'Example Page',
    interactiveElements: elements.map(el => ({
      tag: el.tag ?? 'button',
      text: el.text ?? 'Click me',
      role: el.role,
      rect: el.rect ?? { x: 50, y: 50, w: 100, h: 30 },
      disabled: el.disabled,
    })),
    capturedAt: new Date().toISOString(),
    source: 'extension',
  }
}
import type { AXNode, AXSnapshot } from './accessibility/types'
import type { ChromeSemanticSnapshot, DesktopGroundingSnapshot } from './desktop-grounding-types'

import { describe, expect, it } from 'vitest'

import { buildTargetCandidates, formatGroundingForAgent } from './desktop-grounding'

// ---------------------------------------------------------------------------
// buildTargetCandidates
// ---------------------------------------------------------------------------

describe('buildTargetCandidates', () => {
  it('aX only: extracts interactable nodes', () => {
    const ax = makeAXSnapshot([
      { role: 'AXButton', title: 'OK' },
      { role: 'AXStaticText', title: 'Just text' }, // Non-interactable role
      { role: 'AXTextField', title: 'Input' },
    ])
    const candidates = buildTargetCandidates({
      axSnapshot: ax,
      foregroundApp: 'Finder',
    })

    // Should only include AXButton and AXTextField, not AXStaticText
    expect(candidates.length).toBe(2)
    expect(candidates[0].role).toBe('AXButton')
    expect(candidates[1].role).toBe('AXTextField')
    expect(candidates[0].source).toBe('ax')
    expect(candidates[0].id).toBe('t_0')
    expect(candidates[1].id).toBe('t_1')
  })

  it('chrome only: converts elements to candidates', () => {
    const chrome = makeChromeSnapshot([
      { tag: 'button', text: 'Submit', rect: { x: 10, y: 10, w: 80, h: 30 } },
      { tag: 'a', text: 'Link', rect: { x: 10, y: 50, w: 60, h: 20 } },
    ])
    const candidates = buildTargetCandidates({
      chromeSnapshot: chrome,
      chromeWindowBounds: { x: 0, y: 0, width: 1920, height: 1080 },
      foregroundApp: 'Google Chrome',
    })

    expect(candidates.length).toBe(2)
    expect(candidates[0].source).toBe('chrome_dom')
    expect(candidates[0].tag).toBe('button')
  })

  it('chrome + AX: deduplicates overlapping candidates', () => {
    // Chrome element and AX node at same position → AX should be removed
    const chrome = makeChromeSnapshot([
      { tag: 'button', text: 'Submit', rect: { x: 100, y: 12, w: 50, h: 30 } },
    ])
    const ax = makeAXSnapshot([
      {
        role: 'AXButton',
        title: 'Submit',
        // After the browser-chrome (toolbar) height offset (88px), the Chrome rect
        // becomes screen-absolute: x=100, y=100, w=50, h=30 — same as the AX bounds.
        bounds: { x: 100, y: 100, width: 50, height: 30 },
      },
    ])

    const candidates = buildTargetCandidates({
      axSnapshot: ax,
      chromeSnapshot: chrome,
      chromeWindowBounds: { x: 0, y: 0, width: 1920, height: 1080 },
      foregroundApp: 'Google Chrome',
    })

    // Should have the chrome candidate (preferred) and the AX should be deduped
    const chromeCount = candidates.filter(c => c.source === 'chrome_dom').length
    const axCount = candidates.filter(c => c.source === 'ax').length
    expect(chromeCount).toBe(1)
    // AX candidate may or may not be deduped depending on exact IoU, so only the
    // upper bound is pinned (a single AX node was supplied).
    // NOTE(review): tighten to `toBe(0)` once the adapter's 88px offset is confirmed.
    expect(axCount).toBeLessThanOrEqual(1)
  })

  it('no sources: returns empty', () => {
    const candidates = buildTargetCandidates({ foregroundApp: 'Finder' })
    expect(candidates).toEqual([])
  })

  it('assigns sequential ids', () => {
    const ax = makeAXSnapshot([
      { role: 'AXButton', title: 'A' },
      { role: 'AXButton', title: 'B' },
      { role: 'AXButton', title: 'C' },
    ])
    const candidates = buildTargetCandidates({ axSnapshot: ax, foregroundApp: 'Finder' })
    expect(candidates.map(c => c.id)).toEqual(['t_0', 't_1', 't_2'])
  })

  it('limits to 50 candidates', () => {
    const nodes = Array.from({ length: 60 }, (_, i) => ({
      role: 'AXButton' as const,
      title: `Btn ${i}`,
      bounds: { x: i * 60, y: 100, width: 50, height: 30 },
    }))
    const ax = makeAXSnapshot(nodes)
    const candidates = buildTargetCandidates({ axSnapshot: ax, foregroundApp: 'Finder' })
    expect(candidates.length).toBe(50)
  })

  it('disabled AX nodes have interactable=false', () => {
    const ax = makeAXSnapshot([
      { role: 'AXButton', title: 'Disabled', enabled: false },
    ])
    const candidates = buildTargetCandidates({ axSnapshot: ax, foregroundApp: 'Finder' })
    expect(candidates[0].interactable).toBe(false)
  })
})

// ---------------------------------------------------------------------------
// formatGroundingForAgent
// ---------------------------------------------------------------------------

describe('formatGroundingForAgent', () => {
  /** Snapshot fixture with `candidateCount` AX buttons and only the Chrome source marked stale. */
  function makeFullSnapshot(candidateCount = 2): DesktopGroundingSnapshot {
    const candidates = Array.from({ length: candidateCount }, (_, i) => ({
      id: `t_${i}`,
      source: 'ax' as const,
      appName: 'Finder',
      role: 'AXButton',
      label: `Button ${i}`,
      bounds: { x: 100 + i * 60, y: 100, width: 50, height: 30 },
      confidence: 0.8,
      interactable: true,
    }))

    return {
      snapshotId: 'dg_1',
      capturedAt: new Date().toISOString(),
      foregroundApp: 'Finder',
      windows: [{ id: '1', appName: 'Finder', title: 'Desktop' }],
      screenshot: { dataBase64: '', mimeType: 'image/png', path: '', capturedAt: new Date().toISOString() },
      targetCandidates: candidates,
      staleFlags: { screenshot: false, ax: false, chromeSemantic: true },
    } as DesktopGroundingSnapshot
  }

  it('includes foreground app name', () => {
    const text = formatGroundingForAgent(makeFullSnapshot())
    expect(text).toContain('Finder')
  })

  it('shows staleness warnings', () => {
    const text = formatGroundingForAgent(makeFullSnapshot())
    expect(text).toContain('Chrome semantic')
  })

  it('lists target candidates with ids and bounds', () => {
    const text = formatGroundingForAgent(makeFullSnapshot())
    expect(text).toContain('[t_0]')
    expect(text).toContain('[t_1]')
    expect(text).toContain('AXButton')
    expect(text).toContain('conf=0.80')
  })

  it('truncates at 40 candidates with count note', () => {
    const text = formatGroundingForAgent(makeFullSnapshot(45))
    expect(text).toContain('... and 5 more')
  })

  it('shows Chrome page info when chrome snapshot present', () => {
    const snapshot = makeFullSnapshot()
    snapshot.chromeSemanticSnapshot = {
      pageUrl: 'https://example.com',
      pageTitle: 'Example',
      interactiveElements: [],
      capturedAt: new Date().toISOString(),
      source: 'extension',
    }
    const text = formatGroundingForAgent(snapshot)
    expect(text).toContain('Example')
    expect(text).toContain('https://example.com')
  })

  it('empty candidates → shows "No interactable targets"', () => {
    const snapshot = makeFullSnapshot(0)
    const text = formatGroundingForAgent(snapshot)
    expect(text).toContain('No interactable targets')
  })
})
/**
 * Desktop grounding layer — unified observation aggregation.
 *
 * This is the main entry point for the `desktop_observe` tool.
 * It captures the screenshot, window observation and AX tree in parallel,
 * then (when Chrome is frontmost) the Chrome semantics, and merges everything
 * into a single `DesktopGroundingSnapshot` with deduplicated, ranked target
 * candidates.
 */

import type { AXNode, AXSnapshot } from './accessibility/types'
import type { CdpBridge } from './browser-dom/cdp-bridge'
import type { BrowserDomExtensionBridge } from './browser-dom/extension-bridge'
import type {
  ChromeSemanticSnapshot,
  DesktopGroundingSnapshot,
  DesktopObserveInput,
  DesktopTargetCandidate,
  GroundingStalenessFlags,
} from './desktop-grounding-types'
import type {
  Bounds,
  ComputerUseConfig,
  DesktopExecutor,
  ScreenshotArtifact,
  WindowObservation,
} from './types'

import { captureAXTree } from './accessibility'
import { appNamesMatch } from './app-aliases'
import { captureChromeSemantics, chromeElementsToTargetCandidates } from './chrome-semantic-adapter'
import { TARGET_SOURCE_PRIORITY } from './desktop-grounding-types'
import { boundsIoU } from './snap-resolver'

/**
 * Maximum age (ms) of a sub-snapshot before it is considered stale.
 * If the screenshot/AX/Chrome data is older than this relative to the
 * assembly timestamp, the corresponding stale flag is set.
 */
const STALENESS_THRESHOLD_MS = 2000

/** Known Chrome-like browser app names (lowercase, no .app suffix) */
const CHROME_APPS: ReadonlySet<string> = new Set([
  'google chrome',
  'chrome',
  'google chrome canary',
  'chromium',
])

/**
 * Module-level counter used to build `dg_<n>` snapshot ids.
 * It starts at 1 on module load, so ids are unique only within one process.
 */
let nextSnapshotId = 1
/**
 * Capture a unified desktop grounding snapshot.
 *
 * Runs screenshot, window observation, and AX tree capture in parallel.
 * If the foreground app is Chrome (and `includeChrome` is not false),
 * also captures Chrome semantic data afterwards.
 *
 * Every source is best-effort: a failed screenshot becomes a placeholder,
 * a failed window observation becomes an empty one, and a failed AX or
 * Chrome capture is simply absent. Failures surface through `staleFlags`
 * rather than as a rejected promise.
 *
 * @param params - Capture parameters (config, executor, input, bridges)
 * @returns Unified desktop grounding snapshot
 */
export async function captureDesktopGrounding(params: {
  config: ComputerUseConfig
  executor: DesktopExecutor
  input?: DesktopObserveInput
  extensionBridge?: BrowserDomExtensionBridge
  cdpBridge?: CdpBridge
}): Promise<DesktopGroundingSnapshot> {
  const { config, executor, input, extensionBridge, cdpBridge } = params

  // Phase 1: Parallel capture of all observation sources
  const [screenshotResult, windowsResult, axResult] = await Promise.allSettled([
    executor.takeScreenshot({ label: 'desktop_observe' }),
    executor.observeWindows({ limit: 12 }),
    captureAXTree(config),
  ])

  const screenshot = screenshotResult.status === 'fulfilled' ? screenshotResult.value : createPlaceholderScreenshot()
  const windowObs = windowsResult.status === 'fulfilled' ? windowsResult.value : createEmptyWindowObservation()
  const axSnapshot = axResult.status === 'fulfilled' ? axResult.value : undefined

  // Determine foreground app (`||` on purpose: an empty name counts as missing)
  const foregroundApp = windowObs.frontmostAppName || axSnapshot?.appName || 'unknown'
  const isChromeInFront = isChromeApp(foregroundApp)

  // Phase 2: Chrome semantic data (only if Chrome is foreground and allowed)
  let chromeSemanticSnapshot: ChromeSemanticSnapshot | undefined
  if (isChromeInFront && input?.includeChrome !== false) {
    try {
      chromeSemanticSnapshot = (await captureChromeSemantics(extensionBridge, cdpBridge)) ?? undefined
    }
    catch {
      // Same tolerance as phase 1: a broken bridge must not abort the whole
      // observation. The missing data is reported via `staleFlags.chromeSemantic`.
      chromeSemanticSnapshot = undefined
    }
  }

  // Phase 3: Build target candidates
  const chromeWindowBounds = findChromeWindowBounds(windowObs, foregroundApp)
  const candidates = buildTargetCandidates({
    axSnapshot,
    chromeSnapshot: chromeSemanticSnapshot,
    chromeWindowBounds,
    foregroundApp,
  })

  // Phase 4: Compute staleness relative to the moment of assembly
  const now = Date.now()
  const capturedAt = new Date(now).toISOString()
  const staleFlags = computeStaleness({
    screenshot,
    axSnapshot,
    chromeSemanticSnapshot,
    isChromeInFront,
    assemblyTimestamp: now,
  })

  const snapshotId = `dg_${nextSnapshotId++}`

  return {
    snapshotId,
    capturedAt,
    foregroundApp,
    foregroundWindowTitle: windowObs.frontmostWindowTitle,
    windows: windowObs.windows,
    screenshot,
    axSnapshot,
    chromeSemanticSnapshot,
    targetCandidates: candidates,
    staleFlags,
  }
}

/**
 * Build a merged, deduplicated list of target candidates from all sources.
 *
 * Deduplication: if an `ax` candidate's bounds overlap any `chrome_dom`
 * candidate with IoU ≥ 0.7, the `ax` duplicate is removed (chrome_dom is richer).
 *
 * Chrome candidates are produced only when both the Chrome snapshot and the
 * Chrome window bounds are available.
 *
 * Ids (`t_0`, `t_1`, …) are positional within the returned list, so they are
 * only meaningful together with the snapshot they came from.
 *
 * @returns At most 50 candidates, ordered by `TARGET_SOURCE_PRIORITY`
 *          (unknown sources last), then by confidence descending
 */
export function buildTargetCandidates(params: {
  axSnapshot?: AXSnapshot
  chromeSnapshot?: ChromeSemanticSnapshot
  chromeWindowBounds?: Bounds
  foregroundApp: string
}): DesktopTargetCandidate[] {
  const { axSnapshot, chromeSnapshot, chromeWindowBounds, foregroundApp } = params
  const DEDUP_IOU_THRESHOLD = 0.7
  const MAX_TARGET_CANDIDATES = 50

  // 1. Build Chrome DOM candidates
  let chromeCandidates: DesktopTargetCandidate[] = []
  if (chromeSnapshot && chromeWindowBounds) {
    chromeCandidates = chromeElementsToTargetCandidates(
      chromeSnapshot.interactiveElements,
      chromeWindowBounds,
    )
    // Set appName on all chrome candidates
    for (const c of chromeCandidates) {
      c.appName = foregroundApp
    }
  }

  // 2. Build AX candidates
  let axCandidates: DesktopTargetCandidate[] = []
  if (axSnapshot) {
    axCandidates = axNodesToTargetCandidates(axSnapshot, foregroundApp)
  }

  // 3. Deduplicate: drop AX candidates whose IoU with any Chrome candidate is ≥ 0.7
  if (chromeCandidates.length > 0 && axCandidates.length > 0) {
    axCandidates = axCandidates.filter(axCandidate =>
      !chromeCandidates.some(cc =>
        boundsIoU(cc.bounds, axCandidate.bounds) >= DEDUP_IOU_THRESHOLD,
      ),
    )
  }

  // 4. Merge and rank: source priority first, then confidence descending
  const sourceRank = (source: DesktopTargetCandidate['source']): number => {
    const rank = TARGET_SOURCE_PRIORITY.indexOf(source)
    return rank === -1 ? TARGET_SOURCE_PRIORITY.length : rank
  }
  const merged = [...chromeCandidates, ...axCandidates]
  merged.sort((a, b) => {
    if (a.source !== b.source)
      return sourceRank(a.source) - sourceRank(b.source)
    return b.confidence - a.confidence
  })

  // 5. Keep the top candidates and give them positional ids
  const top = merged.slice(0, MAX_TARGET_CANDIDATES)
  top.forEach((candidate, index) => {
    candidate.id = `t_${index}`
  })
  return top
}
/**
 * Format a grounding snapshot as a text representation for the agent.
 *
 * Produces a compact, LLM-friendly output, in this order:
 * - Foreground app header and snapshot id/timestamp
 * - Staleness warnings (only when at least one source is stale)
 * - Chrome page title and URL (only when a Chrome snapshot is present)
 * - Window count
 * - Target candidate table, capped at 40 rows with an "... and N more" note
 *
 * @param snapshot - Snapshot to render
 * @returns Newline-joined text
 */
export function formatGroundingForAgent(
  snapshot: DesktopGroundingSnapshot,
): string {
  const MAX_LISTED_TARGETS = 40
  const lines: string[] = []

  // Header
  lines.push(`[Desktop Observe] ${snapshot.foregroundApp}`)
  lines.push(` Snapshot: ${snapshot.snapshotId} at ${snapshot.capturedAt}`)

  // Staleness warnings
  const staleWarnings: string[] = []
  if (snapshot.staleFlags.screenshot)
    staleWarnings.push('screenshot')
  if (snapshot.staleFlags.ax)
    staleWarnings.push('AX tree')
  if (snapshot.staleFlags.chromeSemantic)
    staleWarnings.push('Chrome semantic')
  if (staleWarnings.length > 0) {
    lines.push(` ⚠ Stale: ${staleWarnings.join(', ')}`)
  }

  // Chrome info
  if (snapshot.chromeSemanticSnapshot) {
    lines.push(` Chrome page: ${snapshot.chromeSemanticSnapshot.pageTitle} (${snapshot.chromeSemanticSnapshot.pageUrl})`)
  }

  // Windows summary
  lines.push(` Windows: ${snapshot.windows.length}`)

  // Target candidates
  const total = snapshot.targetCandidates.length
  if (total === 0) {
    lines.push(' No interactable targets found.')
  }
  else {
    lines.push(` Targets (${total}):`)
    for (const c of snapshot.targetCandidates.slice(0, MAX_LISTED_TARGETS)) {
      const b = c.bounds
      const focused = c.focused ? ' [focused]' : ''
      const disabled = c.enabled === false ? ' [disabled]' : ''
      lines.push(` [${c.id}] ${c.source} ${c.role} "${c.label}"${focused}${disabled} @(${b.x},${b.y} ${b.width}x${b.height}) conf=${c.confidence.toFixed(2)}`)
    }
    if (total > MAX_LISTED_TARGETS) {
      lines.push(` ... and ${total - MAX_LISTED_TARGETS} more`)
    }
  }

  return lines.join('\n')
}

// ---------------------------------------------------------------------------
// AX tree → target candidates
// ---------------------------------------------------------------------------

/** AX roles that are typically interactable */
const INTERACTABLE_AX_ROLES = new Set([
  'AXButton',
  'AXLink',
  'AXTextField',
  'AXTextArea',
  'AXCheckBox',
  'AXRadioButton',
  'AXPopUpButton',
  'AXComboBox',
  'AXSlider',
  'AXMenuItem',
  'AXMenuBarItem',
  'AXTab',
  'AXTabGroup',
  'AXToolbar',
  'AXIncrementor',
  'AXColorWell',
  'AXDisclosureTriangle',
])

/**
 * Extract interactable nodes from an AX tree and convert to target candidates.
 *
 * Walks the tree depth-first from `snapshot.root`. A node becomes a candidate
 * when it has bounds and its role is in `INTERACTABLE_AX_ROLES`. The label is
 * the first non-empty of title, description, value, role, cut to 80 characters.
 * Candidates are returned in visit order with an empty `id` (assigned by the caller).
 */
function axNodesToTargetCandidates(
  snapshot: AXSnapshot,
  appName: string,
): DesktopTargetCandidate[] {
  const candidates: DesktopTargetCandidate[] = []

  function walk(node: AXNode) {
    // Only include nodes with bounds and interactable roles
    if (node.bounds && INTERACTABLE_AX_ROLES.has(node.role)) {
      const label = node.title || node.description || node.value || node.role
      candidates.push({
        id: '', // Assigned later
        source: 'ax',
        appName,
        role: node.role,
        label: label.slice(0, 80),
        bounds: node.bounds,
        confidence: 0.8,
        // Anything not explicitly disabled is treated as interactable
        interactable: node.enabled !== false,
        axUid: node.uid,
        focused: node.focused,
        enabled: node.enabled,
      })
    }

    for (const child of node.children) {
      walk(child)
    }
  }

  walk(snapshot.root)
  return candidates
}

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/** Compiled regex for stripping .app suffix from macOS app names */
const APP_SUFFIX_RE = /\.app$/u

/** True when `appName` (trimmed, lowercased, without a trailing `.app`) is a known Chrome-like browser. */
function isChromeApp(appName: string): boolean {
  return CHROME_APPS.has(appName.trim().toLowerCase().replace(APP_SUFFIX_RE, ''))
}
/**
 * Pick the bounds of the Chrome window the observation most likely refers to.
 *
 * Returns `undefined` when the foreground app is not Chrome-like or when no
 * on-screen Chrome window with bounds is known. Otherwise prefers, in order:
 * windows whose app name matches the foreground app, then the window whose
 * trimmed title equals the frontmost window title, then the first remaining one.
 */
function findChromeWindowBounds(
  observation: WindowObservation,
  foregroundApp: string,
): Bounds | undefined {
  if (!isChromeApp(foregroundApp))
    return undefined

  const chromeWindows = observation.windows.filter(window =>
    window.bounds
    && window.isOnScreen !== false
    && isChromeApp(window.appName),
  )
  if (chromeWindows.length === 0)
    return undefined

  const foregroundChromeWindows = chromeWindows.filter(window => appNamesMatch(window.appName, foregroundApp))
  const preferredWindows = foregroundChromeWindows.length > 0 ? foregroundChromeWindows : chromeWindows

  const frontmostTitle = observation.frontmostWindowTitle?.trim()
  if (frontmostTitle) {
    const frontmostWindow = preferredWindows.find(window => window.title?.trim() === frontmostTitle)
    if (frontmostWindow?.bounds) {
      return frontmostWindow.bounds
    }
  }

  return preferredWindows[0]?.bounds
}

/**
 * Derive the per-source stale flags for a snapshot being assembled.
 *
 * A source is stale when it is missing, has no timestamp, has a timestamp
 * that cannot be parsed, or is older than the threshold relative to
 * `assemblyTimestamp`. A placeholder screenshot is always stale, and Chrome
 * data is always stale when Chrome is not the foreground app.
 *
 * @param params.assemblyTimestamp - Epoch milliseconds at which the snapshot is assembled
 * @param params.thresholdMs - Optional maximum age in ms; defaults to `STALENESS_THRESHOLD_MS`
 */
function computeStaleness(params: {
  screenshot: ScreenshotArtifact
  axSnapshot?: AXSnapshot
  chromeSemanticSnapshot?: ChromeSemanticSnapshot
  isChromeInFront: boolean
  assemblyTimestamp: number
  thresholdMs?: number
}): GroundingStalenessFlags {
  const { screenshot, axSnapshot, chromeSemanticSnapshot, isChromeInFront, assemblyTimestamp } = params
  const thresholdMs = params.thresholdMs ?? STALENESS_THRESHOLD_MS

  const isStaleTimestamp = (capturedAt: string | undefined): boolean => {
    if (!capturedAt)
      return true
    const ageMs = assemblyTimestamp - new Date(capturedAt).getTime()
    // An unparsable timestamp yields NaN, and `NaN > threshold` is false, so
    // without this check corrupt data would be reported as fresh.
    return Number.isNaN(ageMs) || ageMs > thresholdMs
  }

  return {
    screenshot: screenshot.placeholder === true || isStaleTimestamp(screenshot.capturedAt),
    ax: !axSnapshot || isStaleTimestamp(axSnapshot.capturedAt),
    chromeSemantic: !isChromeInFront
      || !chromeSemanticSnapshot
      || isStaleTimestamp(chromeSemanticSnapshot.capturedAt),
  }
}
/**
 * Stand-in screenshot used when the real capture failed.
 * Carries no image data and is flagged with `placeholder: true`.
 */
function createPlaceholderScreenshot(): ScreenshotArtifact {
  return {
    dataBase64: '',
    mimeType: 'image/png',
    path: '',
    placeholder: true,
    note: 'screenshot capture failed during desktop_observe',
    capturedAt: new Date().toISOString(),
  }
}

/** Stand-in window observation (no windows, no frontmost app) used when window observation failed. */
function createEmptyWindowObservation(): WindowObservation {
  return {
    windows: [],
    observedAt: new Date().toISOString(),
  }
}
/**
 * Build a fully mocked runtime for `createExecuteAction` tests.
 *
 * The executor is a dry-run double that reports Google Chrome on darwin as the
 * foreground app and resolves `click` successfully. The session, terminal
 * runner and bridges are `vi.fn()` stubs. Approval mode is `never` and
 * `defaultCaptureAfter` is off.
 *
 * @returns The runtime plus the individual doubles tests assert against
 */
function createRuntimeForActionTest() {
  const stateManager = new RunStateManager()
  const session = {
    listPendingActions: vi.fn().mockReturnValue([]),
    getBudgetState: vi.fn().mockReturnValue({
      operationsExecuted: 0,
      operationUnitsConsumed: 0,
    }),
    record: vi.fn().mockResolvedValue(undefined),
    createPendingAction: vi.fn(),
    consumeOperation: vi.fn(),
    getLastScreenshot: vi.fn().mockReturnValue(undefined),
    setLastScreenshot: vi.fn(),
    getTerminalState: vi.fn().mockReturnValue(createTerminalState()),
    setTerminalState: vi.fn(),
    getPointerPosition: vi.fn().mockReturnValue(undefined),
    setPointerPosition: vi.fn(),
  }
  const executor = {
    kind: 'dry-run' as const,
    describe: () => ({ kind: 'dry-run' as const, notes: [] }),
    getExecutionTarget: vi.fn().mockResolvedValue(createLocalExecutionTarget()),
    getForegroundContext: vi.fn().mockResolvedValue({
      available: true,
      appName: 'Google Chrome',
      platform: 'darwin',
    }),
    getDisplayInfo: vi.fn().mockResolvedValue(createDisplayInfo({
      platform: 'darwin',
    })),
    getPermissionInfo: vi.fn(),
    observeWindows: vi.fn(),
    takeScreenshot: vi.fn(),
    openApp: vi.fn(),
    focusApp: vi.fn(),
    click: vi.fn().mockResolvedValue({
      performed: true,
      backend: 'dry-run' as const,
      notes: [],
    }),
    typeText: vi.fn(),
    pressKeys: vi.fn(),
    scroll: vi.fn(),
    wait: vi.fn(),
  }
  const terminalRunner = {
    describe: () => ({ kind: 'local-shell-runner' as const, notes: [] }),
    execute: vi.fn(),
    getState: vi.fn().mockReturnValue(createTerminalState()),
    resetState: vi.fn(),
  }
  const browserDomBridge = {
    getStatus: vi.fn().mockReturnValue({
      enabled: true,
      host: '127.0.0.1',
      port: 8765,
      connected: true,
      pendingRequests: 0,
    }),
  }
  const cdpBridgeManager = {
    probeAvailability: vi.fn().mockResolvedValue({
      endpoint: 'http://localhost:9222',
      connected: false,
      connectable: true,
    }),
  }

  // Only the members the action executor touches are mocked, hence the cast.
  const runtime = {
    config: createTestConfig({
      executor: 'dry-run',
      approvalMode: 'never',
      defaultCaptureAfter: false,
    }),
    session,
    executor,
    terminalRunner,
    browserDomBridge,
    cdpBridgeManager,
    stateManager,
    taskMemory: {},
  } as unknown as ComputerUseServerRuntime

  return {
    runtime,
    session,
    executor,
    cdpBridgeManager,
    stateManager,
  }
}
getLastScreenshot: vi.fn().mockReturnValue(undefined), - setLastScreenshot: vi.fn(), - getTerminalState: vi.fn().mockReturnValue(createTerminalState()), - setTerminalState: vi.fn(), - getPointerPosition: vi.fn().mockReturnValue(undefined), - setPointerPosition: vi.fn(), - } - const executor = { - kind: 'dry-run' as const, - describe: () => ({ kind: 'dry-run' as const, notes: [] }), - getExecutionTarget: vi.fn().mockResolvedValue(createLocalExecutionTarget()), - getForegroundContext: vi.fn().mockResolvedValue({ - available: true, - appName: 'Google Chrome', - platform: 'darwin', - }), - getDisplayInfo: vi.fn().mockResolvedValue(createDisplayInfo({ - platform: 'darwin', - })), - getPermissionInfo: vi.fn(), - observeWindows: vi.fn(), - takeScreenshot: vi.fn(), - openApp: vi.fn(), - focusApp: vi.fn(), - click: vi.fn().mockResolvedValue({ - performed: true, - backend: 'dry-run' as const, - notes: [], - }), - typeText: vi.fn(), - pressKeys: vi.fn(), - scroll: vi.fn(), - wait: vi.fn(), - } - const terminalRunner = { - describe: () => ({ kind: 'local-shell-runner' as const, notes: [] }), - execute: vi.fn(), - getState: vi.fn().mockReturnValue(createTerminalState()), - resetState: vi.fn(), - } - const browserDomBridge = { - getStatus: vi.fn().mockReturnValue({ - enabled: true, - host: '127.0.0.1', - port: 8765, - connected: true, - pendingRequests: 0, - }), - } - const cdpBridgeManager = { - probeAvailability: vi.fn().mockResolvedValue({ - endpoint: 'http://localhost:9222', - connected: false, - connectable: true, - }), - } - - const runtime = { - config: createTestConfig({ - executor: 'dry-run', - approvalMode: 'never', - defaultCaptureAfter: false, - }), - session, - executor, - terminalRunner, - browserDomBridge, - cdpBridgeManager, - stateManager, - taskMemory: {}, - } as unknown as ComputerUseServerRuntime + const { runtime, cdpBridgeManager } = createRuntimeForActionTest() const executeAction = createExecuteAction(runtime) const result = await executeAction({ kind: 
'click', input: { x: 10, y: 20, captureAfter: false } }, 'desktop_click') @@ -107,4 +119,117 @@ describe('createExecuteAction', () => { reason: expect.stringContaining('extension DOM stack is preferred'), })) }) + + it('updates pointer state only after desktop_click_target executes successfully', async () => { + const { runtime, executor, session, stateManager } = createRuntimeForActionTest() + stateManager.updateGroundingSnapshot({ + snapshotId: 'dg_1', + capturedAt: new Date().toISOString(), + foregroundApp: 'Google Chrome', + windows: [], + screenshot: { dataBase64: '', mimeType: 'image/png', path: '', capturedAt: new Date().toISOString() }, + targetCandidates: [{ + id: 't_0', + source: 'chrome_dom', + appName: 'Google Chrome', + role: 'button', + label: 'Submit', + bounds: { x: 100, y: 200, width: 40, height: 20 }, + confidence: 0.95, + interactable: true, + }], + staleFlags: { screenshot: false, ax: false, chromeSemantic: false }, + } as any) + + const executeAction = createExecuteAction(runtime) + const result = await executeAction({ + kind: 'desktop_click_target', + input: { candidateId: 't_0' }, + }, 'desktop_click_target') + + expect(result.isError).not.toBe(true) + expect(executor.click).toHaveBeenCalledWith(expect.objectContaining({ + x: 120, + y: 210, + button: 'left', + clickCount: 1, + })) + expect(session.setPointerPosition).toHaveBeenCalledWith({ x: 120, y: 210 }) + expect(stateManager.getState().lastClickedCandidateId).toBe('t_0') + expect(stateManager.getState().lastPointerIntent).toMatchObject({ + candidateId: 't_0', + snappedPoint: { x: 120, y: 210 }, + source: 'chrome_dom', + }) + }) + + it('does not mark a candidate as clicked when desktop_click_target execution fails', async () => { + const { runtime, executor, stateManager } = createRuntimeForActionTest() + stateManager.updateGroundingSnapshot({ + snapshotId: 'dg_1', + capturedAt: new Date().toISOString(), + foregroundApp: 'Google Chrome', + windows: [], + screenshot: { dataBase64: '', 
mimeType: 'image/png', path: '', capturedAt: new Date().toISOString() }, + targetCandidates: [{ + id: 't_0', + source: 'chrome_dom', + appName: 'Google Chrome', + role: 'button', + label: 'Submit', + bounds: { x: 100, y: 200, width: 40, height: 20 }, + confidence: 0.95, + interactable: true, + }], + staleFlags: { screenshot: false, ax: false, chromeSemantic: false }, + } as any) + ;(executor.click as any).mockRejectedValueOnce(new Error('backend failed')) + + const executeAction = createExecuteAction(runtime) + const result = await executeAction({ + kind: 'desktop_click_target', + input: { candidateId: 't_0' }, + }, 'desktop_click_target') + + expect(result.isError).toBe(true) + expect(stateManager.getState().lastClickedCandidateId).toBeUndefined() + expect(stateManager.getState().lastPointerIntent).toBeUndefined() + }) + + it('rejects desktop_click_target when the foreground app changed after desktop_observe', async () => { + const { runtime, executor, stateManager } = createRuntimeForActionTest() + stateManager.updateGroundingSnapshot({ + snapshotId: 'dg_1', + capturedAt: new Date().toISOString(), + foregroundApp: 'Google Chrome', + windows: [], + screenshot: { dataBase64: '', mimeType: 'image/png', path: '', capturedAt: new Date().toISOString() }, + targetCandidates: [{ + id: 't_0', + source: 'chrome_dom', + appName: 'Google Chrome', + role: 'button', + label: 'Submit', + bounds: { x: 100, y: 200, width: 40, height: 20 }, + confidence: 0.95, + interactable: true, + }], + staleFlags: { screenshot: false, ax: false, chromeSemantic: false }, + } as any) + ;(executor.getForegroundContext as any).mockResolvedValue({ + available: true, + appName: 'Terminal', + platform: 'darwin', + }) + + const executeAction = createExecuteAction(runtime) + const result = await executeAction({ + kind: 'desktop_click_target', + input: { candidateId: 't_0' }, + }, 'desktop_click_target') + + expect(result.isError).toBe(true) + expect(executor.click).not.toHaveBeenCalled() + 
/**
 * Decide whether a screen point may be targeted under the configured bounds.
 *
 * - Non-finite coordinates (NaN / ±Infinity) are never accepted, even when no
 *   bounds are configured, so a malformed snap result cannot reach the executor.
 * - When `bounds` is not configured, every finite point is accepted.
 * - Otherwise the point must lie inside the rectangle; all four edges are inclusive.
 *
 * @param params.x - Horizontal coordinate of the point.
 * @param params.y - Vertical coordinate of the point.
 * @param params.bounds - Optional allowed rectangle taken from the config.
 * @returns `true` when the point is an acceptable click target.
 */
function isPointWithinAllowedBounds(params: {
  x: number
  y: number
  bounds: ComputerUseConfig['allowedBounds']
}): boolean {
  const { bounds, x, y } = params

  // Comparisons against NaN are always false, so without this guard the
  // "no bounds" branch below would wave a NaN point through.
  if (!Number.isFinite(x) || !Number.isFinite(y))
    return false

  if (!bounds)
    return true

  return x >= bounds.x
    && y >= bounds.y
    && x <= (bounds.x + bounds.width)
    && y <= (bounds.y + bounds.height)
}
case 'desktop_click_target': {
  // Click a candidate taken from the most recent desktop_observe snapshot.
  // Every guard below throws before the executor is touched; grounding state
  // is only written after executor.click() has resolved.
  const { candidateId } = normalizedAction.input
  const state = runtime.stateManager.getState()
  const snapshot = state.lastGroundingSnapshot
  if (!snapshot) {
    throw new Error('No desktop_observe snapshot available. Call desktop_observe first to get a list of target candidates.')
  }

  // The same candidate may not be clicked twice from one snapshot.
  if (state.lastClickedCandidateId === candidateId) {
    throw new Error(`Candidate "${candidateId}" was already clicked. Call desktop_observe again before clicking the same target.`)
  }

  const snapshotAge = Date.now() - new Date(snapshot.capturedAt).getTime()
  // An unparsable capturedAt yields NaN, and `NaN > max` is false — without
  // this check a snapshot with a broken timestamp would never be considered stale.
  if (!Number.isFinite(snapshotAge)) {
    throw new Error(`Grounding snapshot "${snapshot.snapshotId}" has an invalid capturedAt timestamp. Call desktop_observe to get a fresh snapshot before clicking.`)
  }
  if (snapshotAge > DESKTOP_CLICK_SNAPSHOT_MAX_AGE_MS) {
    throw new Error(`Grounding snapshot "${snapshot.snapshotId}" is ${Math.round(snapshotAge / 1000)}s old. Call desktop_observe to get a fresh snapshot before clicking.`)
  }

  // The snapshot is only trusted while the app (and, when both sides report
  // one, the window title) it was captured for is still in the foreground.
  const currentForeground = await runtime.executor.getForegroundContext()
  if (currentForeground.available && currentForeground.appName && !appNamesMatch(currentForeground.appName, snapshot.foregroundApp)) {
    throw new Error(`Grounding snapshot "${snapshot.snapshotId}" was captured for "${snapshot.foregroundApp}", but the current foreground app is "${currentForeground.appName}". Call desktop_observe again before clicking.`)
  }

  if (
    currentForeground.available
    && currentForeground.windowTitle
    && snapshot.foregroundWindowTitle
    && currentForeground.windowTitle !== snapshot.foregroundWindowTitle
  ) {
    throw new Error(`Grounding snapshot "${snapshot.snapshotId}" was captured for window "${snapshot.foregroundWindowTitle}", but the current foreground window is "${currentForeground.windowTitle}". Call desktop_observe again before clicking.`)
  }

  const snap = resolveSnapByCandidate(candidateId, snapshot)
  if (snap.source === 'none' && !snap.candidateId) {
    throw new Error(`Candidate "${candidateId}" not found in snapshot "${snapshot.snapshotId}". Available candidates: ${snapshot.targetCandidates.map(c => c.id).join(', ')}`)
  }

  if (!isPointWithinAllowedBounds({
    x: snap.snappedPoint.x,
    y: snap.snappedPoint.y,
    bounds: runtime.config.allowedBounds,
  })) {
    throw new Error(`Snap-resolved point (${snap.snappedPoint.x}, ${snap.snappedPoint.y}) is outside the allowed bounds.`)
  }

  const pointerTrace = buildPointerTrace({
    from: runtime.session.getPointerPosition(),
    to: { x: snap.snappedPoint.x, y: snap.snappedPoint.y },
    bounds: runtime.config.allowedBounds,
  })
  const result = await runtime.executor.click({
    x: snap.snappedPoint.x,
    y: snap.snappedPoint.y,
    button: normalizedAction.input.button ?? 'left',
    clickCount: normalizedAction.input.clickCount ?? 1,
    pointerTrace,
  })
  // Reached only when the click resolved: a rejected click propagates from the
  // await above and leaves pointer position and grounding state untouched.
  runtime.session.setPointerPosition({ x: snap.snappedPoint.x, y: snap.snappedPoint.y })

  const candidate = snapshot.targetCandidates.find(c => c.id === candidateId)
  const intent: PointerIntent = {
    mode: 'execute',
    candidateId,
    rawPoint: snap.rawPoint,
    snappedPoint: snap.snappedPoint,
    source: snap.source,
    confidence: candidate?.confidence ?? 0,
    path: pointerTrace,
  }
  runtime.stateManager.updatePointerIntent(intent, candidateId)

  backendResult = {
    ...result,
    candidateId,
    snap,
    pointerTrace,
  }
  break
}
/**
 * Build a minimal runtime stub for the grounding-tool tests.
 *
 * Only the members the registered handlers read are populated: a test config,
 * a real RunStateManager, a CDP bridge manager whose status reports
 * `connected: false`, and empty bridge/executor placeholders.
 */
function createRuntime() {
  const config = createTestConfig()
  const stateManager = new RunStateManager()

  // Status is "not connected", so desktop_observe never asks for a bridge.
  const cdpBridgeManager = {
    getStatus: vi.fn().mockReturnValue({ connected: false }),
    ensureBridge: vi.fn(),
  }

  const runtimeStub = {
    config,
    stateManager,
    cdpBridgeManager,
    browserDomBridge: {},
    executor: {},
  }

  return runtimeStub as unknown as ComputerUseServerRuntime
}
+ const result = await invoke('desktop_observe', {}) + + expect(result.isError).toBe(true) + expect(runtime.stateManager.getState().lastGroundingSnapshot).toBeUndefined() + expect(runtime.stateManager.getState().lastPointerIntent).toBeUndefined() + expect(runtime.stateManager.getState().lastClickedCandidateId).toBeUndefined() + }) + + it('stores grounding snapshot without screenshot bytes but still returns image content', async () => { + const runtime = createRuntime() + captureDesktopGroundingMock.mockResolvedValueOnce({ + snapshotId: 'dg_new', + capturedAt: new Date().toISOString(), + foregroundApp: 'Google Chrome', + windows: [], + screenshot: { + dataBase64: 'ZmFrZS1wbmc=', + mimeType: 'image/png', + path: '/tmp/shot.png', + capturedAt: new Date().toISOString(), + width: 1280, + height: 720, + }, + targetCandidates: [], + staleFlags: { screenshot: false, ax: false, chromeSemantic: false }, + } as any) + + const { server, invoke } = createMockServer() + + registerDesktopGroundingTools({ + server, + runtime, + executeAction: vi.fn(), + }) + + const result = await invoke('desktop_observe', {}) + const state = runtime.stateManager.getState() + + expect(state.lastGroundingSnapshot?.screenshot.dataBase64).toBe('') + expect(result.content).toEqual([ + expect.objectContaining({ type: 'text' }), + expect.objectContaining({ + type: 'image', + data: 'ZmFrZS1wbmc=', + mimeType: 'image/png', + }), + ]) + }) +}) diff --git a/services/computer-use-mcp/src/server/register-desktop-grounding.test.ts b/services/computer-use-mcp/src/server/register-desktop-grounding.test.ts new file mode 100644 index 0000000000..ce7201d4c0 --- /dev/null +++ b/services/computer-use-mcp/src/server/register-desktop-grounding.test.ts @@ -0,0 +1,256 @@ +import type { + DesktopGroundingSnapshot, + DesktopTargetCandidate, + TargetSource, +} from '../desktop-grounding-types' + +import { describe, expect, it } from 'vitest' + +import { RunStateManager } from '../state' + +// 
// ---------------------------------------------------------------------------
// Test grounding state management through RunStateManager
// (the tools delegate all state to RunStateManager, so we test that interface)
// ---------------------------------------------------------------------------

/**
 * Build a target candidate fixture.
 *
 * Any field may be overridden. `id` and `source` always fall back to their
 * defaults (`'t_0'` / `'chrome_dom'`), even when the override object carries
 * the key with an explicit `undefined`.
 *
 * @param overrides - Partial candidate merged over the defaults.
 */
function makeCandidate(overrides: Partial<DesktopTargetCandidate> = {}): DesktopTargetCandidate {
  return {
    appName: 'Google Chrome',
    role: 'button',
    label: 'Submit',
    bounds: { x: 100, y: 200, width: 80, height: 30 },
    confidence: 0.95,
    interactable: true,
    ...overrides,
    // Applied after the spread so `{ id: undefined }` cannot erase the default.
    id: overrides.id ?? 't_0',
    source: overrides.source ?? 'chrome_dom',
  }
}

/**
 * Build a grounding snapshot fixture (`dg_1`, foreground app "Google Chrome",
 * no stale flags) around the given candidates.
 *
 * @param candidates - Candidates to expose; defaults to a single default candidate.
 */
function makeSnapshot(candidates: DesktopTargetCandidate[] = [makeCandidate()]): DesktopGroundingSnapshot {
  return {
    snapshotId: 'dg_1',
    capturedAt: new Date().toISOString(),
    foregroundApp: 'Google Chrome',
    windows: [],
    screenshot: { dataBase64: '', mimeType: 'image/png', path: '', capturedAt: new Date().toISOString() },
    targetCandidates: candidates,
    staleFlags: { screenshot: false, ax: false, chromeSemantic: false },
  } as DesktopGroundingSnapshot
}
describe('desktop_click_target preconditions via RunStateManager', () => {
  // Pointer intent for a completed click on `candidateId`, using the same
  // fixed point and confidence in every case below.
  const clickIntentFor = (candidateId: string) => ({
    mode: 'execute' as const,
    candidateId,
    rawPoint: { x: 140, y: 215 },
    snappedPoint: { x: 140, y: 215 },
    source: 'chrome_dom' as TargetSource,
    confidence: 0.95,
    path: [{ x: 140, y: 215, delayMs: 0 }],
  })

  // Assertions compare the stored value directly instead of a pre-computed
  // boolean, so a failure reports the actual candidate id.

  it('rejects when no snapshot is available', () => {
    const sm = new RunStateManager()
    expect(sm.getState().lastGroundingSnapshot).toBeUndefined()
  })

  it('rejects duplicate click on same candidate', () => {
    const sm = new RunStateManager()
    sm.updateGroundingSnapshot(makeSnapshot())
    sm.updatePointerIntent(clickIntentFor('t_0'), 't_0')

    expect(sm.getState().lastClickedCandidateId).toBe('t_0')
  })

  it('allows click on different candidate', () => {
    const sm = new RunStateManager()
    sm.updateGroundingSnapshot(makeSnapshot([
      makeCandidate({ id: 't_0' }),
      makeCandidate({ id: 't_1', label: 'Cancel' }),
    ]))
    sm.updatePointerIntent(clickIntentFor('t_0'), 't_0')

    expect(sm.getState().lastClickedCandidateId).not.toBe('t_1')
  })

  it('allows re-click after re-observe', () => {
    const sm = new RunStateManager()
    sm.updateGroundingSnapshot(makeSnapshot())
    sm.updatePointerIntent(clickIntentFor('t_0'), 't_0')

    // Re-observe resets clicked candidate
    sm.updateGroundingSnapshot(makeSnapshot())
    expect(sm.getState().lastClickedCandidateId).not.toBe('t_0')
  })
})
// Shape checks for the state object returned by RunStateManager.getState().
// Optional chaining replaces the non-null `!` assertions: a missing value
// fails the matcher with a readable diff instead of relying on a compile-time
// promise that the preceding toBeDefined() passed.
describe('overlay polling contract: desktop_get_state exposes grounding data', () => {
  it('exposes lastGroundingSnapshot after updateGroundingSnapshot', () => {
    const sm = new RunStateManager()
    const snapshot = makeSnapshot([
      makeCandidate({ id: 't_0' }),
      makeCandidate({ id: 't_1', label: 'Cancel' }),
    ])

    sm.updateGroundingSnapshot(snapshot)

    const stored = sm.getState().lastGroundingSnapshot
    expect(stored).toBeDefined()
    expect(stored?.snapshotId).toBe('dg_1')
    expect(stored?.targetCandidates).toHaveLength(2)
    expect(stored?.staleFlags).toEqual({
      screenshot: false,
      ax: false,
      chromeSemantic: false,
    })
  })

  it('exposes lastPointerIntent after updatePointerIntent', () => {
    const sm = new RunStateManager()
    sm.updateGroundingSnapshot(makeSnapshot())
    sm.updatePointerIntent({
      mode: 'execute',
      candidateId: 't_0',
      rawPoint: { x: 140, y: 215 },
      snappedPoint: { x: 140, y: 215 },
      source: 'chrome_dom' as TargetSource,
      confidence: 0.95,
      path: [{ x: 140, y: 215, delayMs: 0 }],
    }, 't_0')

    const state = sm.getState()
    expect(state.lastPointerIntent).toBeDefined()
    expect(state.lastPointerIntent?.candidateId).toBe('t_0')
    expect(state.lastPointerIntent?.snappedPoint).toEqual({ x: 140, y: 215 })
    expect(state.lastPointerIntent?.source).toBe('chrome_dom')
    expect(state.lastClickedCandidateId).toBe('t_0')
  })

  it('returns stable shape when no grounding state exists', () => {
    const sm = new RunStateManager()

    const state = sm.getState()
    expect(state.lastGroundingSnapshot).toBeUndefined()
    expect(state.lastPointerIntent).toBeUndefined()
    expect(state.lastClickedCandidateId).toBeUndefined()
  })
})
a/services/computer-use-mcp/src/server/register-desktop-grounding.ts b/services/computer-use-mcp/src/server/register-desktop-grounding.ts new file mode 100644 index 0000000000..36ffea084b --- /dev/null +++ b/services/computer-use-mcp/src/server/register-desktop-grounding.ts @@ -0,0 +1,160 @@ +/** + * MCP tool registration for desktop grounding tools: + * - `desktop_observe` — unified observation (screenshot + AX + Chrome semantic) + * - `desktop_click_target` — snap-resolved click by candidate id + * + * These tools work together: the agent first calls `desktop_observe` to get + * a list of interactable target candidates, then uses `desktop_click_target` + * to click on a specific candidate by its id. + * + * State is managed through `runtime.stateManager` (RunStateManager), not + * a private closure. This ensures `desktop_get_state` and the overlay can + * read the latest grounding/pointer data. + */ + +import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js' + +import type { ExecuteAction } from './action-executor' +import type { ComputerUseServerRuntime } from './runtime' + +import process from 'node:process' + +import { z } from 'zod' + +import { captureDesktopGrounding, formatGroundingForAgent } from '../desktop-grounding' +import { textContent } from './content' +import { registerToolWithDescriptor, requireDescriptor } from './tool-descriptors/register-helper' + +/** + * Register desktop grounding MCP tools on the server. + * + * Uses the unified runtime for executor, bridges, and state management. + * Grounding state (snapshot, pointer intent, clicked candidate) flows + * through `runtime.stateManager` so it's visible to `desktop_get_state`, + * the overlay, and strategy rules. 
/**
 * Register the desktop grounding MCP tools (`desktop_observe`,
 * `desktop_click_target`) on the server.
 *
 * `desktop_observe` captures a grounding snapshot, stores it (without the
 * screenshot bytes) in `runtime.stateManager`, refreshes the last-screenshot
 * and foreground-context state, and returns the formatted text plus the
 * screenshot as image content. On any failure it clears the grounding state
 * and returns an `isError` result instead of throwing.
 *
 * `desktop_click_target` performs no work itself; it forwards its arguments
 * to `executeAction` as a `desktop_click_target` action.
 *
 * @param params.server - MCP server the tools are registered on.
 * @param params.runtime - Runtime providing config, executor, bridges and state manager.
 * @param params.executeAction - Action pipeline used by `desktop_click_target`.
 */
export function registerDesktopGroundingTools(params: {
  server: McpServer
  runtime: ComputerUseServerRuntime
  executeAction: ExecuteAction
}) {
  const { server, runtime, executeAction } = params

  // -----------------------------------------------------------------------
  // desktop_observe
  // -----------------------------------------------------------------------

  registerToolWithDescriptor(server, {
    descriptor: requireDescriptor('desktop_observe'),

    schema: {
      includeChrome: z.boolean().optional().describe('Whether to include Chrome semantic data. Default: auto-detect based on foreground app.'),
    },

    handler: async ({ includeChrome }) => {
      try {
        // Try to get an existing CDP bridge (non-fatal if unavailable)
        let cdpBridge: import('../browser-dom/cdp-bridge').CdpBridge | undefined
        try {
          const status = runtime.cdpBridgeManager.getStatus()
          if (status.connected) {
            cdpBridge = await runtime.cdpBridgeManager.ensureBridge()
          }
        }
        catch {
          // CDP bridge unavailable — graceful degradation
        }

        const snapshot = await captureDesktopGrounding({
          config: runtime.config,
          executor: runtime.executor,
          input: { includeChrome },
          extensionBridge: runtime.browserDomBridge,
          cdpBridge,
        })

        // Every screenshot access below goes through this one binding so a
        // snapshot without a screenshot is handled the same way everywhere.
        const screenshot = snapshot.screenshot

        // Update RunState — grounding snapshot, with the base64 payload
        // blanked so the stored state does not carry the image bytes.
        runtime.stateManager.updateGroundingSnapshot({
          ...snapshot,
          screenshot: screenshot
            ? {
                ...screenshot,
                dataBase64: '',
              }
            : screenshot,
        })

        // Also update screenshot state so desktop_get_state and other
        // tools can see the latest screenshot from this observation
        if (screenshot && !screenshot.placeholder) {
          runtime.stateManager.updateLastScreenshot({
            path: screenshot.path || '',
            width: screenshot.width,
            height: screenshot.height,
            capturedAt: screenshot.capturedAt,
            placeholder: false,
          })
        }

        // Update foreground context from the observation
        if (snapshot.foregroundApp && snapshot.foregroundApp !== 'unknown') {
          runtime.stateManager.updateForegroundContext({
            available: true,
            appName: snapshot.foregroundApp,
            platform: process.platform,
          })
        }

        const text = formatGroundingForAgent(snapshot)

        // Include screenshot as image content if available
        const content: Array<{ type: 'text', text: string } | { type: 'image', data: string, mimeType: 'image/png' }> = [
          { type: 'text', text },
        ]

        // Optional chaining matters here: an unguarded dereference of a
        // missing screenshot would throw into the catch below, which clears
        // the grounding state stored a few lines earlier.
        if (screenshot?.dataBase64 && !screenshot.placeholder) {
          content.push({
            type: 'image',
            data: screenshot.dataBase64,
            mimeType: 'image/png',
          })
        }

        return { content }
      }
      catch (error) {
        // A failed observation must not leave an older snapshot clickable.
        runtime.stateManager.clearGroundingState()
        const message = error instanceof Error ? error.message : String(error)
        return {
          content: [textContent(`desktop_observe failed: ${message}`)],
          isError: true,
        }
      }
    },
  })

  // -----------------------------------------------------------------------
  // desktop_click_target
  // -----------------------------------------------------------------------

  registerToolWithDescriptor(server, {
    descriptor: requireDescriptor('desktop_click_target'),

    schema: {
      candidateId: z.string().describe('Target candidate id from the last desktop_observe snapshot (e.g. "t_0")'),
      clickCount: z.number().int().min(1).max(3).optional().describe('Number of clicks (default: 1, 2 = double-click)'),
      button: z.enum(['left', 'right', 'middle']).optional().describe('Mouse button (default: left)'),
    },

    // Delegates to executeAction rather than touching the executor directly.
    handler: async ({ candidateId, clickCount, button }) => {
      return await executeAction({
        kind: 'desktop_click_target',
        input: {
          candidateId,
          clickCount,
          button,
        },
      }, 'desktop_click_target')
    },
  })
}
Returns matching elements with their UIDs, roles, titles, values, and bounds.', + lane: 'accessibility', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, +] diff --git a/services/computer-use-mcp/src/server/tool-descriptors/all.ts b/services/computer-use-mcp/src/server/tool-descriptors/all.ts new file mode 100644 index 0000000000..1691ad9fcc --- /dev/null +++ b/services/computer-use-mcp/src/server/tool-descriptors/all.ts @@ -0,0 +1,75 @@ +/** + * Aggregate All Tool Descriptors + * + * This module combines all domain-specific descriptor modules + * and exports a unified registry. + */ + +import type { ToolDescriptor } from './types' + +import { accessibilityDescriptors } from './accessibility' +import { cdpDescriptors } from './cdp' +import { codingDescriptors } from './coding' +import { desktopDescriptors, internalDescriptors, metaDescriptors } from './desktop' +import { displayDescriptors } from './display' +import { ptyDescriptors } from './pty' +import { globalRegistry, ToolDescriptorRegistry } from './registry' +import { taskMemoryDescriptors } from './task-memory' +import { vscodeDescriptors } from './vscode' + +/** + * All public tool descriptors combined. + */ +export const allDescriptors: ToolDescriptor[] = [ + ...accessibilityDescriptors, + ...cdpDescriptors, + ...codingDescriptors, + ...desktopDescriptors, + ...displayDescriptors, + ...ptyDescriptors, + ...taskMemoryDescriptors, + ...vscodeDescriptors, + ...metaDescriptors, +] + +/** + * All descriptors including internal/test tools. + */ +export const allDescriptorsIncludingInternal: ToolDescriptor[] = [ + ...allDescriptors, + ...internalDescriptors, +] + +/** + * Initialize the global registry with all descriptors. + * Call this once at server startup. 
+ */ +export function initializeGlobalRegistry(): ToolDescriptorRegistry { + globalRegistry.clear() + globalRegistry.registerAll(allDescriptorsIncludingInternal) + return globalRegistry +} + +/** + * Create a new registry pre-populated with all descriptors. + * Useful for testing or isolated scenarios. + */ +export function createPopulatedRegistry(): ToolDescriptorRegistry { + const registry = new ToolDescriptorRegistry() + registry.registerAll(allDescriptorsIncludingInternal) + return registry +} + +// Re-export domain descriptors for direct access +export { + accessibilityDescriptors, + cdpDescriptors, + codingDescriptors, + desktopDescriptors, + displayDescriptors, + internalDescriptors, + metaDescriptors, + ptyDescriptors, + taskMemoryDescriptors, + vscodeDescriptors, +} diff --git a/services/computer-use-mcp/src/server/tool-descriptors/cdp.ts b/services/computer-use-mcp/src/server/tool-descriptors/cdp.ts new file mode 100644 index 0000000000..dcc7a1e121 --- /dev/null +++ b/services/computer-use-mcp/src/server/tool-descriptors/cdp.ts @@ -0,0 +1,99 @@ +/** + * CDP (Chrome DevTools Protocol) Tool Descriptors + */ + +import type { ToolDescriptor } from './types' + +export const cdpDescriptors: ToolDescriptor[] = [ + { + canonicalName: 'browser_cdp_connect', + displayName: 'Browser CDP Connect', + summary: 'Connect to a Chrome instance via Chrome DevTools Protocol. 
Establishes a CDP session for browser automation without requiring a browser extension.', + lane: 'browser_cdp', + kind: 'control', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_cdp_status', + displayName: 'Browser CDP Status', + summary: 'Get the current status of the CDP bridge connection including page URL, title, and connection state.', + lane: 'browser_cdp', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_cdp_accessibility_snapshot', + displayName: 'Browser CDP Accessibility Snapshot', + summary: 'Capture the accessibility tree of the current page via CDP. Returns a hierarchical snapshot of accessible elements.', + lane: 'browser_cdp', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_cdp_evaluate', + displayName: 'Browser CDP Evaluate', + summary: 'Execute JavaScript code in the page context via CDP. Returns the evaluation result.', + lane: 'browser_cdp', + kind: 'write', + readOnly: false, + destructive: true, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_cdp_collect_elements', + displayName: 'Browser CDP Collect Elements', + summary: 'Collect interactive elements from the current page via CDP. 
Returns a list of clickable, focusable, and input elements with their bounds and attributes.', + lane: 'browser_cdp', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_cdp_screenshot', + displayName: 'Browser CDP Screenshot', + summary: 'Take a screenshot of the current page via CDP. Returns the image in PNG or JPEG format.', + lane: 'browser_cdp', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_cdp_navigate', + displayName: 'Browser CDP Navigate', + summary: 'Navigate the browser to a specified URL via CDP. Waits for the page to load.', + lane: 'browser_cdp', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, +] diff --git a/services/computer-use-mcp/src/server/tool-descriptors/coding.ts b/services/computer-use-mcp/src/server/tool-descriptors/coding.ts new file mode 100644 index 0000000000..fdccdabb6d --- /dev/null +++ b/services/computer-use-mcp/src/server/tool-descriptors/coding.ts @@ -0,0 +1,237 @@ +/** + * Coding Tool Descriptors + */ + +import type { ToolDescriptor } from './types' + +export const codingDescriptors: ToolDescriptor[] = [ + { + canonicalName: 'coding_review_workspace', + displayName: 'Coding Review Workspace', + summary: 'Analyze the workspace state including git status, project structure, and recent changes. 
Provides context for code navigation and editing decisions.', + lane: 'coding', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'coding_read_file', + displayName: 'Coding Read File', + summary: 'Read the exact contents of a file with optional line range. Returns raw file content without line numbers so you can copy-paste exact strings for patches.', + lane: 'coding', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + }, + { + canonicalName: 'coding_search_text', + displayName: 'Coding Search Text', + summary: 'Search for text patterns across files in the workspace. Supports glob patterns and result limits.', + lane: 'coding', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + }, + { + canonicalName: 'coding_search_symbol', + displayName: 'Coding Search Symbol', + summary: 'Find symbol declarations (functions, classes, variables) in TypeScript/JavaScript files. Uses language service for accurate results.', + lane: 'coding', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + }, + { + canonicalName: 'coding_find_references', + displayName: 'Coding Find References', + summary: 'Find all references to a symbol at a specific file location. Uses TypeScript language service for accurate cross-file references.', + lane: 'coding', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + }, + { + canonicalName: 'coding_select_target', + displayName: 'Coding Select Target', + summary: 'Deterministically select a target file for editing based on file path, symbol name, or search query. 
Narrows scope for subsequent operations.', + lane: 'coding', + kind: 'workflow', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'coding_plan_changes', + displayName: 'Coding Plan Changes', + summary: 'Create a bounded coding plan for implementing changes. Limits scope to max 3 files to maintain focus and reduce risk.', + lane: 'coding', + kind: 'workflow', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'coding_analyze_impact', + displayName: 'Coding Analyze Impact', + summary: 'Analyze the 1-hop impact graph of a target file or symbol. Identifies files and symbols that may be affected by changes.', + lane: 'coding', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'coding_validate_hypothesis', + displayName: 'Coding Validate Hypothesis', + summary: 'Validate hypotheses about code behavior against the actual impact graph. Helps catch incorrect assumptions before making changes.', + lane: 'coding', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'coding_diagnose_changes', + displayName: 'Coding Diagnose Changes', + summary: 'Perform root cause analysis when validation or tests fail after changes. 
Identifies likely sources of problems.', + lane: 'coding', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'coding_capture_validation_baseline', + displayName: 'Coding Capture Validation Baseline', + summary: 'Capture a baseline of current diffs and optionally create a temporary worktree for safe experimentation.', + lane: 'coding', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'coding_review_changes', + displayName: 'Coding Review Changes', + summary: 'Perform a deterministic diff-aware review of changes in the current file. Identifies potential issues and inconsistencies.', + lane: 'coding', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'coding_apply_patch', + displayName: 'Coding Apply Patch', + summary: 'Apply a patch to a file by replacing an old string with a new string. This is a mutating operation that modifies file contents.', + lane: 'coding', + kind: 'write', + readOnly: false, + destructive: true, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + }, + { + canonicalName: 'coding_compress_context', + displayName: 'Coding Compress Context', + summary: 'Summarize the current coding context including goals, file states, and recent results. 
Useful for managing context window limits.', + lane: 'coding', + kind: 'workflow', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'coding_report_status', + displayName: 'Coding Report Status', + summary: 'Report structured execution status including completion state, summary, affected files, commands run, and validation checks.', + lane: 'coding', + kind: 'workflow', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'coding_write_file', + displayName: 'Coding Write File', + summary: 'Create a new file or overwrite an existing file with provided content. Creates parent directories if they do not exist. For partial edits, use coding_apply_patch instead.', + lane: 'coding', + kind: 'write', + readOnly: false, + destructive: true, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + }, + { + canonicalName: 'coding_list_files', + displayName: 'Coding List Files', + summary: 'List files and directories matching glob patterns within the workspace. Supports include/exclude patterns. Returns relative paths with file/directory indicators.', + lane: 'coding', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + }, + + // Autonomous coding loop + { + canonicalName: 'coding_agentic_run', + displayName: 'Coding Agentic Run', + summary: 'Start an autonomous coding loop to complete a task. 
The agent will read, search, edit files, and run commands to accomplish the given goal.', + lane: 'coding', + kind: 'write', + readOnly: false, + destructive: true, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, +] diff --git a/services/computer-use-mcp/src/server/tool-descriptors/desktop.ts b/services/computer-use-mcp/src/server/tool-descriptors/desktop.ts new file mode 100644 index 0000000000..2695579536 --- /dev/null +++ b/services/computer-use-mcp/src/server/tool-descriptors/desktop.ts @@ -0,0 +1,684 @@ +/** + * Desktop, Terminal, and Browser DOM Tool Descriptors + * + * These are the main computer use tools from register-tools.ts + */ + +import type { ToolDescriptor } from './types' + +export const desktopDescriptors: ToolDescriptor[] = [ + // Desktop observation tools + { + canonicalName: 'desktop_get_capabilities', + displayName: 'Desktop Get Capabilities', + summary: 'Get system capabilities, preflight checks, and policy configuration. Returns executor type, permissions, display info, and coordinate space details.', + lane: 'desktop', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'desktop_observe_windows', + displayName: 'Desktop Observe Windows', + summary: 'List visible windows with optional app name filter. Returns window titles, bounds, and PIDs.', + lane: 'desktop', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'desktop_screenshot', + displayName: 'Desktop Screenshot', + summary: 'Capture a screenshot of the current display. 
Returns the image in base64 format with optional metadata label.', + lane: 'desktop', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'desktop_observe', + displayName: 'Desktop Observe', + summary: 'Capture unified desktop observation: screenshot + window list + AX tree + Chrome semantics (when Chrome is foreground). Returns ranked interactable target candidates with ids, bounds, and source labels.', + lane: 'desktop', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: false, + }, + { + canonicalName: 'desktop_click_target', + displayName: 'Desktop Click Target', + summary: 'Click on a target candidate from the last desktop_observe snapshot using snap-resolved coordinates (chrome_dom > ax > vision > raw priority). Requires a recent desktop_observe call.', + lane: 'desktop', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: false, + }, + + // Desktop interaction tools + { + canonicalName: 'desktop_open_app', + displayName: 'Desktop Open App', + summary: 'Open an application. Requires the app to be in COMPUTER_USE_OPENABLE_APPS configuration.', + lane: 'desktop', + kind: 'control', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'desktop_focus_app', + displayName: 'Desktop Focus App', + summary: 'Focus an application window. 
Requires the app to be in COMPUTER_USE_OPENABLE_APPS configuration.', + lane: 'desktop', + kind: 'control', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'desktop_click', + displayName: 'Desktop Click', + summary: 'Click at screen coordinates. Supports left/right/middle buttons and click count for double-click.', + lane: 'desktop', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'desktop_type_text', + displayName: 'Desktop Type Text', + summary: 'Type text into the focused element. Optionally click at coordinates first. Long text (>160 chars) requires approval.', + lane: 'desktop', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'desktop_press_keys', + displayName: 'Desktop Press Keys', + summary: 'Press a keyboard shortcut or key combination. Some shortcuts (Cmd+Q, Cmd+Space, etc.) are denied by policy.', + lane: 'desktop', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'desktop_scroll', + displayName: 'Desktop Scroll', + summary: 'Scroll the screen at optional coordinates. Supports both vertical and horizontal scrolling.', + lane: 'desktop', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'desktop_wait', + displayName: 'Desktop Wait', + summary: 'Wait for a specified duration (0-30000ms). 
Useful for letting UI animations complete.', + lane: 'desktop', + kind: 'control', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'desktop_get_state', + displayName: 'Desktop Get State', + summary: 'Get the current desktop automation state including session status, operation counts, and active context.', + lane: 'desktop', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: false, + }, + { + canonicalName: 'desktop_get_session_trace', + displayName: 'Desktop Get Session Trace', + summary: 'Get the session execution trace including all actions taken, timing, and outcomes.', + lane: 'desktop', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'desktop_list_pending_actions', + displayName: 'Desktop List Pending Actions', + summary: 'List all actions awaiting approval when running in approval-required mode.', + lane: 'desktop', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'desktop_approve_pending_action', + displayName: 'Desktop Approve Pending Action', + summary: 'Approve a pending action by its ID, allowing it to proceed with execution.', + lane: 'desktop', + kind: 'control', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'desktop_reject_pending_action', + displayName: 'Desktop Reject Pending Action', + summary: 'Reject a pending action by its ID, preventing it from executing.', + lane: 'desktop', + kind: 'control', + readOnly: false, + destructive: false, + 
concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + + // Terminal tools + { + canonicalName: 'terminal_exec', + displayName: 'Terminal Exec', + summary: 'Execute a shell command in the local terminal. Returns stdout, stderr, exit code, and timing. Always requires approval.', + lane: 'desktop', + kind: 'write', + readOnly: false, + destructive: true, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'terminal_reset_state', + displayName: 'Terminal Reset State', + summary: 'Reset the terminal state. Clears accumulated command history and state.', + lane: 'desktop', + kind: 'control', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'terminal_get_state', + displayName: 'Terminal Get State', + summary: 'Get the current terminal execution state including command history and output.', + lane: 'desktop', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + + // Browser DOM tools (extension-based) + { + canonicalName: 'browser_dom_read_page', + displayName: 'Browser DOM Read Page', + summary: 'Read the DOM structure from all frames in the browser via the extension bridge. Returns interactive elements and page structure.', + lane: 'browser_dom', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_dom_find_elements', + displayName: 'Browser DOM Find Elements', + summary: 'Find elements in the browser DOM matching a selector or role. 
Returns matching elements with bounds and attributes.', + lane: 'browser_dom', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_dom_click', + displayName: 'Browser DOM Click', + summary: 'Click a DOM element matching a CSS selector via the browser extension bridge. Works across multiple frames.', + lane: 'browser_dom', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_dom_check_checkbox', + displayName: 'Browser DOM Check Checkbox', + summary: 'Check, uncheck, or toggle a checkbox or radio element via CSS selector in the browser DOM.', + lane: 'browser_dom', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_dom_set_input_value', + displayName: 'Browser DOM Set Input Value', + summary: 'Set the value of an input, select, or textarea element. Optionally simulate keystrokes and blur the element.', + lane: 'browser_dom', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_dom_select_option', + displayName: 'Browser DOM Select Option', + summary: 'Select an option in a select element by value or visible text via the browser extension bridge.', + lane: 'browser_dom', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_dom_trigger_event', + displayName: 'Browser DOM Trigger Event', + summary: 'Dispatch a custom DOM event on an element. 
Supports multiple event types like click, input, change, and custom events.', + lane: 'browser_dom', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_dom_read_input_value', + displayName: 'Browser DOM Read Input Value', + summary: 'Read the current value of an input, select, or textarea element via CSS selector.', + lane: 'browser_dom', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_dom_get_element_attributes', + displayName: 'Browser DOM Get Element Attributes', + summary: 'Get all HTML attributes of a DOM element matching the CSS selector.', + lane: 'browser_dom', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_dom_get_computed_styles', + displayName: 'Browser DOM Get Computed Styles', + summary: 'Retrieve computed CSS styles for an element. Can optionally filter specific CSS properties for layout inspection.', + lane: 'browser_dom', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_dom_wait_for_element', + displayName: 'Browser DOM Wait For Element', + summary: 'Wait for an element matching a CSS selector to appear in the DOM. 
Useful for handling async UI updates.', + lane: 'browser_dom', + kind: 'control', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_dom_get_active_tab', + displayName: 'Browser DOM Get Active Tab', + summary: 'Get the currently active browser tab with title, URL, and metadata.', + lane: 'browser_dom', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'browser_dom_get_bridge_status', + displayName: 'Browser DOM Get Bridge Status', + summary: 'Get browser DOM bridge connection status including connection state, host, port, and any errors.', + lane: 'browser_dom', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + + // Clipboard tools + { + canonicalName: 'clipboard_read_text', + displayName: 'Clipboard Read Text', + summary: 'Read text content from the system clipboard. Requires approval due to privacy sensitivity.', + lane: 'desktop', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'clipboard_write_text', + displayName: 'Clipboard Write Text', + summary: 'Write text content to the system clipboard. Requires approval due to data sensitivity.', + lane: 'desktop', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, + + // Secret/env tools + { + canonicalName: 'secret_read_env_value', + displayName: 'Secret Read Env Value', + summary: 'Read an environment variable value. 
Requires approval due to potential exposure of secrets.', + lane: 'desktop', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, + + // Workflow tools + { + canonicalName: 'workflow_browse_and_act', + displayName: 'Workflow Browse And Act', + summary: 'Execute a workflow to browse to a URL, act on the page, and optionally submit data. Combines navigation and interaction.', + lane: 'workflow', + kind: 'workflow', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + }, + { + canonicalName: 'workflow_coding_loop', + displayName: 'Workflow Coding Loop', + summary: 'Execute a coding workflow with iterative plan-act-validate-diagnose-recover cycles.', + lane: 'workflow', + kind: 'workflow', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + }, + { + canonicalName: 'workflow_coding_agentic_loop', + displayName: 'Workflow Coding Agentic Loop', + summary: 'Execute an agentic coding workflow with autonomous decision-making and iteration.', + lane: 'workflow', + kind: 'workflow', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'workflow_open_workspace', + displayName: 'Workflow Open Workspace', + summary: 'Open a workspace and validate it is ready for development work.', + lane: 'workflow', + kind: 'workflow', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'workflow_run_tests', + displayName: 'Workflow Run Tests', + summary: 'Execute test suite and report results. 
Handles test failures with diagnostics.', + lane: 'workflow', + kind: 'workflow', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'workflow_validate_workspace', + displayName: 'Workflow Validate Workspace', + summary: 'Validate workspace state including dependencies, configuration, and readiness for work.', + lane: 'workflow', + kind: 'workflow', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'workflow_inspect_failure', + displayName: 'Workflow Inspect Failure', + summary: 'Inspect and diagnose a test or build failure. Provides root cause analysis and recovery suggestions.', + lane: 'workflow', + kind: 'workflow', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'workflow_resume', + displayName: 'Workflow Resume', + summary: 'Resume a suspended workflow from its last checkpoint.', + lane: 'workflow', + kind: 'workflow', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'workflow_switch_lane', + displayName: 'Workflow Switch Lane', + summary: 'Request a cross-lane handoff (coding → browser, browser → coding, coding → terminal, terminal → coding). Must declare a typed reason and at least one verification constraint. 
Denied if route is not in the approved allowlist.', + lane: 'workflow', + kind: 'control', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, +] + +/** + * Internal/test tools (not exposed to MCP clients by default) + */ +export const internalDescriptors: ToolDescriptor[] = [ + { + canonicalName: 'desktop_open_test_target', + displayName: 'Desktop Open Test Target', + summary: 'Open a test target for development/testing purposes. Only available when test tools are enabled.', + lane: 'internal', + kind: 'internal', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: false, + defaultDeferred: true, + }, +] + +/** + * Meta/directory tools + */ +export const metaDescriptors: ToolDescriptor[] = [ + { + canonicalName: 'tool_directory', + displayName: 'Tool Directory', + summary: 'List available tools with optional filtering by lane, kind, or search query. Returns a compact directory for navigation and a structured list for programmatic access.', + lane: 'internal', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'tool_search', + displayName: 'Tool Search', + summary: 'Search tool descriptors by lightweight query and return top ranked candidates without full schema payloads.', + lane: 'internal', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + }, + + // Web tools — HTTP-based information gathering without browser surfaces + { + canonicalName: 'web_fetch', + displayName: 'Web Fetch', + summary: 'Fetch content from a URL via HTTP. Converts HTML to simplified text. No JavaScript execution. 
For pages requiring login or JS, use browser tools instead.', + lane: 'desktop', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'web_search', + displayName: 'Web Search', + summary: 'Search the web using a query string. Returns a list of result URLs with titles and snippets. Use web_fetch to read full page content.', + lane: 'desktop', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, +] diff --git a/services/computer-use-mcp/src/server/tool-descriptors/display.ts b/services/computer-use-mcp/src/server/tool-descriptors/display.ts new file mode 100644 index 0000000000..c413816204 --- /dev/null +++ b/services/computer-use-mcp/src/server/tool-descriptors/display.ts @@ -0,0 +1,34 @@ +/** + * Display Tool Descriptors + */ + +import type { ToolDescriptor } from './types' + +export const displayDescriptors: ToolDescriptor[] = [ + { + canonicalName: 'display_enumerate', + displayName: 'Display Enumerate', + summary: 'List all connected displays with their bounds, scale factors, and pixel dimensions. 
Useful for understanding the coordinate space.', + lane: 'display', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'display_identify_point', + displayName: 'Display Identify Point', + summary: 'Identify which display contains a given coordinate and return the local coordinates within that display.', + lane: 'display', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, +] diff --git a/services/computer-use-mcp/src/server/tool-descriptors/index.ts b/services/computer-use-mcp/src/server/tool-descriptors/index.ts new file mode 100644 index 0000000000..fba4548a53 --- /dev/null +++ b/services/computer-use-mcp/src/server/tool-descriptors/index.ts @@ -0,0 +1,60 @@ +/** + * Tool Descriptors Module + * + * This module provides the unified tool descriptor registry for computer-use-mcp. + * All public MCP tools should have their metadata defined here as the single + * source of truth. 
+ * + * Usage: + * ```typescript + * import { globalRegistry, initializeGlobalRegistry } from './tool-descriptors' + * + * // At server startup + * initializeGlobalRegistry() + * + * // Get a descriptor + * const desc = globalRegistry.get('accessibility_snapshot') + * + * // Query tools + * const readOnlyTools = globalRegistry.query({ readOnlyOnly: true }) + * ``` + */ + +// All descriptors +export { + accessibilityDescriptors, + allDescriptors, + allDescriptorsIncludingInternal, + cdpDescriptors, + codingDescriptors, + createPopulatedRegistry, + desktopDescriptors, + displayDescriptors, + initializeGlobalRegistry, + internalDescriptors, + metaDescriptors, + ptyDescriptors, + taskMemoryDescriptors, + vscodeDescriptors, +} from './all' +// Helpers +export { + getToolKind, + getToolLane, + getToolSummary, + isToolConcurrencySafe, + isToolReadOnly, + registerToolWithDescriptor, + requireDescriptor, + toolInstances, + toolRequiresApprovalByDefault, + validateToolsHaveDescriptors, +} from './register-helper' + +// Registry +export type { ToolQueryOptions } from './registry' +export { globalRegistry, ToolDescriptorRegistry } from './registry' + +// Types +export type { ToolDescriptor, ToolKind, ToolLane } from './types' +export { isToolDescriptor, validateDescriptor } from './types' diff --git a/services/computer-use-mcp/src/server/tool-descriptors/pty.ts b/services/computer-use-mcp/src/server/tool-descriptors/pty.ts new file mode 100644 index 0000000000..fd081d042f --- /dev/null +++ b/services/computer-use-mcp/src/server/tool-descriptors/pty.ts @@ -0,0 +1,96 @@ +/** + * PTY (Pseudo-Terminal) Tool Descriptors + */ + +import type { ToolDescriptor } from './types' + +export const ptyDescriptors: ToolDescriptor[] = [ + { + canonicalName: 'pty_get_status', + displayName: 'PTY Get Status', + summary: 'Get the current status of all PTY sessions including their IDs, PIDs, dimensions, and alive states.', + lane: 'pty', + kind: 'read', + readOnly: true, + destructive: false, + 
concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + }, + { + canonicalName: 'pty_create', + displayName: 'PTY Create', + summary: 'Create a new PTY session with specified dimensions and working directory. Returns the session ID and PID for subsequent operations.', + lane: 'pty', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + }, + { + canonicalName: 'pty_write', + displayName: 'PTY Write', + summary: 'Write raw data to a PTY session. The data is sent directly to the terminal without interpretation.', + lane: 'pty', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + }, + { + canonicalName: 'pty_send_input', + displayName: 'PTY Send Input', + summary: 'Send input to a PTY session with optional special key handling (Enter, Ctrl+C, etc.).', + lane: 'pty', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'pty_read_screen', + displayName: 'PTY Read Screen', + summary: 'Read the current screen content from a PTY session. Returns the visible terminal buffer.', + lane: 'pty', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'pty_resize', + displayName: 'PTY Resize', + summary: 'Resize a PTY session to new dimensions. Updates the terminal size for proper text wrapping.', + lane: 'pty', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'pty_destroy', + displayName: 'PTY Destroy', + summary: 'Destroy a specific PTY session. 
Terminates the underlying process and releases resources.', + lane: 'pty', + kind: 'write', + readOnly: false, + destructive: true, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, +] diff --git a/services/computer-use-mcp/src/server/tool-descriptors/register-helper.ts b/services/computer-use-mcp/src/server/tool-descriptors/register-helper.ts new file mode 100644 index 0000000000..4140bac3a9 --- /dev/null +++ b/services/computer-use-mcp/src/server/tool-descriptors/register-helper.ts @@ -0,0 +1,147 @@ +/** + * Registration Helper + * + * Utilities for descriptor-driven tool registration. + */ + +import type { McpServer, RegisteredTool } from '@modelcontextprotocol/sdk/server/mcp.js' +import type { CallToolResult } from '@modelcontextprotocol/sdk/types.js' +import type { ZodRawShape, ZodTypeAny } from 'zod' + +import type { ToolDescriptor } from './types' + +import { initializeGlobalRegistry } from './all' +import { globalRegistry } from './registry' + +/** + * Registry of instantiated tools on the current server. + */ +export const toolInstances = new Map() + +/** + * Options for descriptor-driven tool registration. + */ +export interface DescriptorToolOptions { + /** + * The tool descriptor (from registry or inline). + */ + descriptor: ToolDescriptor + + /** + * Zod schema for input validation. + */ + schema: TSchema + + /** + * Tool handler function. + */ + handler: (input: { [K in keyof TSchema]: TSchema[K] extends ZodTypeAny ? TSchema[K]['_output'] : never }, extra: unknown) => Promise +} + +/** + * Register a tool using its descriptor. + * The description is automatically taken from the descriptor's summary. 
+ */ +export function registerToolWithDescriptor( + server: McpServer, + options: DescriptorToolOptions, +): RegisteredTool { + const { descriptor, schema, handler } = options + + // Fail closed: the canonical name must already be in the global registry (this function does not populate it) + if (!globalRegistry.has(descriptor.canonicalName)) { + throw new Error( + `Tool "${descriptor.canonicalName}" is not registered in the global descriptor registry. ` + + 'All tools must have descriptors registered before use.', + ) + } + + // Register with MCP server + // The description comes from the descriptor's summary + // NOTE: the `any` cast is required because the MCP SDK overload shape does not express the generic descriptor schema here. + const registeredTool = (server.tool as any)( + descriptor.canonicalName, + descriptor.summary, + schema, + handler, + ) as RegisteredTool + + toolInstances.set(descriptor.canonicalName, registeredTool) + + if (descriptor.defaultDeferred && registeredTool?.disable) { + registeredTool.disable() + } + + return registeredTool +} + +/** + * Get descriptor for a tool name, populating the global registry first if it is empty and throwing if the name is not found. + */ +export function requireDescriptor(canonicalName: string): ToolDescriptor { + if (globalRegistry.size === 0) { + initializeGlobalRegistry() + } + return globalRegistry.get(canonicalName) +} + +/** + * Get descriptor summary for use in tool registration; throws if the tool has no registered descriptor. NOTE(review): unlike requireDescriptor, this does not populate an empty global registry first - confirm that is intended. + */ +export function getToolSummary(canonicalName: string): string { + return globalRegistry.get(canonicalName).summary +} + +/** + * Check if a tool is read-only according to its descriptor; throws if the tool has no registered descriptor. + */ +export function isToolReadOnly(canonicalName: string): boolean { + return globalRegistry.get(canonicalName).readOnly +} + +/** + * Check if a tool requires approval by default according to its descriptor; throws if the tool has no registered descriptor. + */ +export function toolRequiresApprovalByDefault(canonicalName: string): boolean { + return globalRegistry.get(canonicalName).requiresApprovalByDefault +} + +/** + * Check if a tool is concurrency-safe according to its descriptor; throws if the tool has no registered descriptor.
+ */ +export function isToolConcurrencySafe(canonicalName: string): boolean { + return globalRegistry.get(canonicalName).concurrencySafe +} + +/** + * Get the lane for a tool; throws if the tool has no registered descriptor. + */ +export function getToolLane(canonicalName: string): string { + return globalRegistry.get(canonicalName).lane +} + +/** + * Get the kind for a tool; throws if the tool has no registered descriptor. + */ +export function getToolKind(canonicalName: string): string { + return globalRegistry.get(canonicalName).kind +} + +/** + * Validate that all tool names have registered descriptors; `valid` is true when `missing` is empty, regardless of `orphans`. + * Useful for testing registry completeness: `missing` holds the given names without a descriptor, `orphans` holds registered names not in `toolNames`. + */ +export function validateToolsHaveDescriptors(toolNames: string[]): { + valid: boolean + missing: string[] + orphans: string[] +} { + const missing = globalRegistry.validateCompleteness(toolNames) + const orphans = globalRegistry.findOrphans(toolNames) + + return { + valid: missing.length === 0, + missing, + orphans, + } +} diff --git a/services/computer-use-mcp/src/server/tool-descriptors/registry.test.ts b/services/computer-use-mcp/src/server/tool-descriptors/registry.test.ts new file mode 100644 index 0000000000..a9e2f19490 --- /dev/null +++ b/services/computer-use-mcp/src/server/tool-descriptors/registry.test.ts @@ -0,0 +1,341 @@ +/** + * Tool Descriptor Registry Tests + */ + +import { readdirSync, readFileSync } from 'node:fs' +import { dirname, resolve } from 'node:path' +import { fileURLToPath } from 'node:url' + +import { describe, expect, it } from 'vitest' + +import { + allDescriptors, + allDescriptorsIncludingInternal, + createPopulatedRegistry, + globalRegistry, + initializeGlobalRegistry, + validateToolsHaveDescriptors, +} from './index' +import { validateDescriptor } from './types' + +describe('toolDescriptorRegistry', () => { + describe('registry initialization', () => { + it('should initialize with all descriptors', () => { + const registry = createPopulatedRegistry() + + expect(registry.size).toBeGreaterThan(0) + expect(registry.size).toBe(allDescriptorsIncludingInternal.length) + }) + +
it('should not have duplicate canonical names', () => { + const names = allDescriptorsIncludingInternal.map(d => d.canonicalName) + const uniqueNames = new Set(names) + + expect(uniqueNames.size).toBe(names.length) + }) + + it('should have all required fields in each descriptor', () => { + for (const descriptor of allDescriptorsIncludingInternal) { + expect(() => validateDescriptor(descriptor)).not.toThrow() + } + }) + }) + + describe('public descriptors', () => { + it('should have only public descriptors in allDescriptors', () => { + for (const descriptor of allDescriptors) { + expect(descriptor.public).toBe(true) + } + }) + + it('should have at least 50 public tools', () => { + // We expect at least 50 public tools based on the inventory + expect(allDescriptors.length).toBeGreaterThanOrEqual(50) + }) + }) + + describe('registry query', () => { + it('should query by lane', () => { + const registry = createPopulatedRegistry() + const codingTools = registry.query({ lane: 'coding' }) + + expect(codingTools.length).toBeGreaterThan(0) + for (const tool of codingTools) { + expect(tool.lane).toBe('coding') + } + }) + + it('should query by kind', () => { + const registry = createPopulatedRegistry() + const readTools = registry.query({ kind: 'read' }) + + expect(readTools.length).toBeGreaterThan(0) + for (const tool of readTools) { + expect(tool.kind).toBe('read') + } + }) + + it('should query read-only tools', () => { + const registry = createPopulatedRegistry() + const readOnlyTools = registry.query({ readOnlyOnly: true }) + + expect(readOnlyTools.length).toBeGreaterThan(0) + for (const tool of readOnlyTools) { + expect(tool.readOnly).toBe(true) + } + }) + + it('should query tools requiring approval', () => { + const registry = createPopulatedRegistry() + const approvalTools = registry.query({ approvalRequiredOnly: true }) + + expect(approvalTools.length).toBeGreaterThan(0) + for (const tool of approvalTools) { + expect(tool.requiresApprovalByDefault).toBe(true) + } + }) 
+ + it('should query by text search', () => { + const registry = createPopulatedRegistry() + const screenshotTools = registry.query({ query: 'screenshot' }) + + expect(screenshotTools.length).toBeGreaterThan(0) + for (const tool of screenshotTools) { + const searchable = `${tool.canonicalName} ${tool.displayName} ${tool.summary}`.toLowerCase() + expect(searchable).toContain('screenshot') + } + }) + + it('should combine multiple filters', () => { + const registry = createPopulatedRegistry() + const results = registry.query({ + lane: 'coding', + readOnlyOnly: true, + }) + + expect(results.length).toBeGreaterThan(0) + for (const tool of results) { + expect(tool.lane).toBe('coding') + expect(tool.readOnly).toBe(true) + } + }) + }) + + describe('registry lookup', () => { + it('should get descriptor by canonical name', () => { + const registry = createPopulatedRegistry() + const desc = registry.get('accessibility_snapshot') + + expect(desc.canonicalName).toBe('accessibility_snapshot') + expect(desc.lane).toBe('accessibility') + expect(desc.kind).toBe('read') + }) + + it('should throw for unknown tool', () => { + const registry = createPopulatedRegistry() + + expect(() => registry.get('nonexistent_tool')).toThrow(/Unknown tool/) + }) + + it('should return undefined for optional lookup of unknown tool', () => { + const registry = createPopulatedRegistry() + const desc = registry.getOptional('nonexistent_tool') + + expect(desc).toBeUndefined() + }) + }) + + describe('global registry', () => { + it('should initialize global registry', () => { + initializeGlobalRegistry() + + expect(globalRegistry.size).toBe(allDescriptorsIncludingInternal.length) + }) + + it('should be queryable after initialization', () => { + initializeGlobalRegistry() + const results = globalRegistry.query({ lane: 'desktop' }) + + expect(results.length).toBeGreaterThan(0) + }) + + it('should have zero missing desktop grounding descriptors against register usage', () => { + initializeGlobalRegistry() + + 
const descriptorsDir = dirname(fileURLToPath(import.meta.url)) + const serverDir = resolve(descriptorsDir, '..') + const registerFiles = readdirSync(serverDir) + .filter(fileName => fileName === 'register-desktop-grounding.ts') + + const toolNames = new Set() + const requireDescriptorPattern = /requireDescriptor\(\s*['"]([^'"]+)['"]\s*\)/g + + for (const fileName of registerFiles) { + const source = readFileSync(resolve(serverDir, fileName), 'utf8') + for (const match of source.matchAll(requireDescriptorPattern)) { + const name = match[1] + if (name) + toolNames.add(name) + } + } + + expect(Array.from(toolNames).sort()).toEqual([ + 'desktop_click_target', + 'desktop_observe', + ]) + + const result = validateToolsHaveDescriptors(Array.from(toolNames)) + + expect(result.valid).toBe(true) + expect(result.missing).toEqual([]) + }) + }) + + describe('descriptor validation', () => { + it('should validate lane values', () => { + const invalidDescriptor = { + canonicalName: 'test_tool', + displayName: 'Test Tool', + summary: 'A test tool', + lane: 'invalid_lane' as const, + kind: 'read' as const, + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + } + + expect(() => validateDescriptor(invalidDescriptor as never)).toThrow(/invalid lane/) + }) + + it('should validate kind values', () => { + const invalidDescriptor = { + canonicalName: 'test_tool', + displayName: 'Test Tool', + summary: 'A test tool', + lane: 'desktop' as const, + kind: 'invalid_kind' as const, + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + } + + expect(() => validateDescriptor(invalidDescriptor as never)).toThrow(/invalid kind/) + }) + }) + + describe('lane coverage', () => { + it('should have descriptors for all expected lanes', () => { + const registry = createPopulatedRegistry() + const expectedLanes = [ + 'desktop', + 'browser_dom', + 'browser_cdp', + 'coding', + 
'pty', + 'display', + 'accessibility', + 'task_memory', + 'vscode', + 'workflow', + ] + + const groups = registry.groupByLane() + + for (const lane of expectedLanes) { + const tools = groups.get(lane as never) + expect(tools, `Expected tools for lane: ${lane}`).toBeDefined() + expect(tools!.length, `Expected at least one tool for lane: ${lane}`).toBeGreaterThan(0) + } + }) + }) + + describe('specific tool descriptors', () => { + it('should have correct accessibility_snapshot descriptor', () => { + const registry = createPopulatedRegistry() + const desc = registry.get('accessibility_snapshot') + + expect(desc.lane).toBe('accessibility') + expect(desc.kind).toBe('read') + expect(desc.readOnly).toBe(true) + expect(desc.destructive).toBe(false) + expect(desc.requiresApprovalByDefault).toBe(false) + }) + + it('should have correct terminal_exec descriptor', () => { + const registry = createPopulatedRegistry() + const desc = registry.get('terminal_exec') + + expect(desc.lane).toBe('desktop') + expect(desc.kind).toBe('write') + expect(desc.readOnly).toBe(false) + expect(desc.destructive).toBe(true) + expect(desc.requiresApprovalByDefault).toBe(true) + }) + + it('should have correct coding_apply_patch descriptor', () => { + const registry = createPopulatedRegistry() + const desc = registry.get('coding_apply_patch') + + expect(desc.lane).toBe('coding') + expect(desc.kind).toBe('write') + expect(desc.readOnly).toBe(false) + expect(desc.destructive).toBe(true) + expect(desc.requiresApprovalByDefault).toBe(true) + }) + + it('should have correct tool_directory descriptor', () => { + const registry = createPopulatedRegistry() + const desc = registry.get('tool_directory') + + expect(desc.lane).toBe('internal') + expect(desc.kind).toBe('read') + expect(desc.readOnly).toBe(true) + expect(desc.public).toBe(true) + }) + }) + + describe('desktop grounding tool enablement', () => { + // These 3 core desktop grounding tools must be eagerly enabled (defaultDeferred: false) + // so the 
overlay can poll desktop_get_state, and agents can call desktop_observe / desktop_click_target + // without needing an explicit enable step. + const eagerlyEnabledTools = [ + 'desktop_get_state', + 'desktop_observe', + 'desktop_click_target', + ] + + for (const toolName of eagerlyEnabledTools) { + it(`should have ${toolName} eagerly enabled (defaultDeferred: false)`, () => { + const registry = createPopulatedRegistry() + const desc = registry.get(toolName) + + expect(desc.defaultDeferred, `${toolName} must NOT be deferred — it is a core grounding tool`).toBeFalsy() + }) + } + + // Other desktop interaction tools must remain deferred to avoid exposing + // the full desktop surface without explicit enablement. + const mustRemainDeferredTools = [ + 'desktop_click', + 'desktop_type_text', + 'desktop_press_keys', + 'desktop_scroll', + 'desktop_open_app', + 'desktop_focus_app', + 'terminal_exec', + ] + + for (const toolName of mustRemainDeferredTools) { + it(`should keep ${toolName} deferred (defaultDeferred: true)`, () => { + const registry = createPopulatedRegistry() + const desc = registry.get(toolName) + + expect(desc.defaultDeferred, `${toolName} must remain deferred`).toBe(true) + }) + } + }) +}) diff --git a/services/computer-use-mcp/src/server/tool-descriptors/registry.ts b/services/computer-use-mcp/src/server/tool-descriptors/registry.ts new file mode 100644 index 0000000000..409eade607 --- /dev/null +++ b/services/computer-use-mcp/src/server/tool-descriptors/registry.ts @@ -0,0 +1,197 @@ +/** + * Tool Descriptor Registry + * + * Central registry for all tool descriptors. Provides lookup, query, + * and validation capabilities. + */ + +import type { ToolDescriptor, ToolKind, ToolLane } from './types' + +import { validateDescriptor } from './types' + +/** + * Filter options for querying tools. 
+ */ +export interface ToolQueryOptions { + lane?: ToolLane + kind?: ToolKind + readOnlyOnly?: boolean + approvalRequiredOnly?: boolean + query?: string +} + +/** + * Tool Descriptor Registry manages all tool descriptors and provides + * lookup, query, and validation capabilities. + */ +export class ToolDescriptorRegistry { + private readonly descriptors: Map = new Map() + + /** + * Register a single descriptor. Validates completeness and uniqueness. + */ + register(descriptor: ToolDescriptor): void { + validateDescriptor(descriptor) + + if (this.descriptors.has(descriptor.canonicalName)) { + throw new Error(`Duplicate tool descriptor: ${descriptor.canonicalName}`) + } + + this.descriptors.set(descriptor.canonicalName, descriptor) + } + + /** + * Register multiple descriptors. + */ + registerAll(descriptors: ToolDescriptor[]): void { + for (const descriptor of descriptors) { + this.register(descriptor) + } + } + + /** + * Get a descriptor by canonical name. + * Throws if not found (fail-closed). + */ + get(canonicalName: string): ToolDescriptor { + const descriptor = this.descriptors.get(canonicalName) + if (!descriptor) { + throw new Error(`Unknown tool: ${canonicalName}. All tools must have registered descriptors.`) + } + return descriptor + } + + /** + * Get a descriptor by canonical name, or undefined if not found. + */ + getOptional(canonicalName: string): ToolDescriptor | undefined { + return this.descriptors.get(canonicalName) + } + + /** + * Check if a tool is registered. + */ + has(canonicalName: string): boolean { + return this.descriptors.has(canonicalName) + } + + /** + * Get all registered descriptors. + */ + getAll(): ToolDescriptor[] { + return Array.from(this.descriptors.values()) + } + + /** + * Get all public descriptors (tools exposed to MCP clients). + */ + getPublic(): ToolDescriptor[] { + return this.getAll().filter(d => d.public) + } + + /** + * Get all canonical names. 
+ */ + getNames(): string[] { + return Array.from(this.descriptors.keys()) + } + + /** + * Get the count of registered descriptors. + */ + get size(): number { + return this.descriptors.size + } + + /** + * Query public descriptors with filters; every supplied filter must match, and `query` is a case-insensitive substring match on canonical name, display name, or summary. + */ + query(options: ToolQueryOptions = {}): ToolDescriptor[] { + let results = this.getPublic() + + if (options.lane) { + results = results.filter(d => d.lane === options.lane) + } + + if (options.kind) { + results = results.filter(d => d.kind === options.kind) + } + + if (options.readOnlyOnly) { + results = results.filter(d => d.readOnly) + } + + if (options.approvalRequiredOnly) { + results = results.filter(d => d.requiresApprovalByDefault) + } + + if (options.query) { + const queryLower = options.query.toLowerCase() + results = results.filter(d => + d.canonicalName.toLowerCase().includes(queryLower) + || d.displayName.toLowerCase().includes(queryLower) + || d.summary.toLowerCase().includes(queryLower), + ) + } + + return results + } + + /** + * Get public descriptors grouped by lane; a lane with no public descriptors has no entry in the map. + */ + groupByLane(): Map { + const groups = new Map() + + for (const descriptor of this.getPublic()) { + const existing = groups.get(descriptor.lane) || [] + existing.push(descriptor) + groups.set(descriptor.lane, existing) + } + + return groups + } + + /** + * Get public descriptors grouped by kind; a kind with no public descriptors has no entry in the map. + */ + groupByKind(): Map { + const groups = new Map() + + for (const descriptor of this.getPublic()) { + const existing = groups.get(descriptor.kind) || [] + existing.push(descriptor) + groups.set(descriptor.kind, existing) + } + + return groups + } + + /** + * Validate that all provided tool names have registered descriptors. + * Returns list of missing tool names. + */ + validateCompleteness(toolNames: string[]): string[] { + return toolNames.filter(name => !this.has(name)) + } + + /** + * Find orphan descriptors (registered but not in provided tool names); every registered name is considered, including non-public descriptors.
+ */ + findOrphans(toolNames: string[]): string[] { + const toolSet = new Set(toolNames) + return this.getNames().filter(name => !toolSet.has(name)) + } + + /** + * Clear all registered descriptors (for testing). + */ + clear(): void { + this.descriptors.clear() + } +} + +/** + * Global registry instance. + */ +export const globalRegistry = new ToolDescriptorRegistry() diff --git a/services/computer-use-mcp/src/server/tool-descriptors/task-memory.ts b/services/computer-use-mcp/src/server/tool-descriptors/task-memory.ts new file mode 100644 index 0000000000..5d9cf0407d --- /dev/null +++ b/services/computer-use-mcp/src/server/tool-descriptors/task-memory.ts @@ -0,0 +1,47 @@ +/** + * Task Memory Tool Descriptors + */ + +import type { ToolDescriptor } from './types' + +export const taskMemoryDescriptors: ToolDescriptor[] = [ + { + canonicalName: 'task_memory_update', + displayName: 'Task Memory Update', + summary: 'Write or merge task execution state including goal, current step, confirmed facts, artifacts, blockers, and plan.', + lane: 'task_memory', + kind: 'memory', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'task_memory_get', + displayName: 'Task Memory Get', + summary: 'Read the current task memory snapshot. Returns the full task execution state.', + lane: 'task_memory', + kind: 'memory', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'task_memory_clear', + displayName: 'Task Memory Clear', + summary: 'Reset all task memory and execution state. 
Clears goals, steps, facts, and artifacts.', + lane: 'task_memory', + kind: 'memory', + readOnly: false, + destructive: true, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, +] diff --git a/services/computer-use-mcp/src/server/tool-descriptors/types.ts b/services/computer-use-mcp/src/server/tool-descriptors/types.ts new file mode 100644 index 0000000000..33029c31b4 --- /dev/null +++ b/services/computer-use-mcp/src/server/tool-descriptors/types.ts @@ -0,0 +1,178 @@ +/** + * Tool Descriptor Registry Types + * + * This module defines the canonical types for tool descriptors in the + * computer-use-mcp package. All public MCP tools must have a descriptor + * registered with all required fields (fail-closed policy). + */ + +/** + * Tool lanes represent the domain/subsystem a tool belongs to. + * Each lane groups related tools that operate on the same surface. + */ +export type ToolLane + = | 'desktop' // Desktop automation (click, type, screenshot, etc.) + | 'browser_dom' // Browser DOM via extension bridge + | 'browser_cdp' // Browser via Chrome DevTools Protocol + | 'coding' // Code analysis and editing + | 'pty' // PTY/terminal session management + | 'display' // Display enumeration and identification + | 'accessibility' // Accessibility tree inspection + | 'task_memory' // Task execution state management + | 'vscode' // VS Code CLI automation + | 'workflow' // Workflow orchestration tools + | 'internal' // Internal/diagnostic tools + +/** + * Tool kinds represent the nature of a tool's operation. 
+ */ +export type ToolKind + = | 'read' // Read-only observation (screenshot, status, enumerate) + | 'write' // State mutation (click, type, patch, create) + | 'control' // Control flow (wait, reset, connect) + | 'workflow' // Workflow orchestration (plan, report) + | 'memory' // State/memory management + | 'internal' // Internal diagnostics + +/** + * Tool descriptor defines the canonical metadata for a single MCP tool. + * All fields except the optional `defaultDeferred` are required (fail-closed policy). No required field may be left undefined or null. + */ +export interface ToolDescriptor { + /** + * Canonical tool name as registered with MCP server. + * Must match exactly what's passed to `server.tool(name, ...)`. + * Format: snake_case, e.g., 'accessibility_snapshot' + */ + canonicalName: string + + /** + * Human-readable display name. + * Format: Title Case, e.g., 'Accessibility Snapshot' + */ + displayName: string + + /** + * Short description of what the tool does (one or two sentences). + * This is used as the MCP tool description. + */ + summary: string + + /** + * The domain/subsystem this tool belongs to. + */ + lane: ToolLane + + /** + * The nature of this tool's operation. + */ + kind: ToolKind + + /** + * Whether this tool only reads state and never mutates it. + * True = safe to call without approval for observation. + */ + readOnly: boolean + + /** + * Whether this tool can cause irreversible changes. + * True = extra caution required (e.g., file deletion, code mutation). + */ + destructive: boolean + + /** + * Whether this tool is safe to run concurrently with other tools. + * False = should be serialized in workflow execution. + */ + concurrencySafe: boolean + + /** + * Whether this tool requires approval by default. + * This is the baseline; stricter rules may still apply. + */ + requiresApprovalByDefault: boolean + + /** + * Whether this tool is exposed to MCP clients. + * False = internal tool not registered with MCP server.
+ */ + public: boolean + /** + * Whether this tool is hidden from the default tool list to reduce context bloat. + * True = deferred loading (must be explicitly enabled via tool_search). Optional; descriptor-driven registration only disables the tool when this is truthy. + */ + defaultDeferred?: boolean +} + +/** + * Type guard checking that a value has the required ToolDescriptor fields with the right primitive types; lane/kind values and the optional defaultDeferred are not checked. + */ +export function isToolDescriptor(obj: unknown): obj is ToolDescriptor { + if (typeof obj !== 'object' || obj === null) { + return false + } + + const record = obj as Record + + return ( + typeof record.canonicalName === 'string' + && typeof record.displayName === 'string' + && typeof record.summary === 'string' + && typeof record.lane === 'string' + && typeof record.kind === 'string' + && typeof record.readOnly === 'boolean' + && typeof record.destructive === 'boolean' + && typeof record.concurrencySafe === 'boolean' + && typeof record.requiresApprovalByDefault === 'boolean' + && typeof record.public === 'boolean' + ) +} + +/** + * Validates that a descriptor has all required fields (a field counts as missing when it is undefined or null). + * Throws if a required field is missing or if lane or kind is not an allowed value; other field types and defaultDeferred are not checked.
+ */ +export function validateDescriptor(descriptor: ToolDescriptor): void { + const requiredFields: (keyof ToolDescriptor)[] = [ + 'canonicalName', + 'displayName', + 'summary', + 'lane', + 'kind', + 'readOnly', + 'destructive', + 'concurrencySafe', + 'requiresApprovalByDefault', + 'public', + ] + + for (const field of requiredFields) { + if (descriptor[field] === undefined || descriptor[field] === null) { + throw new Error(`ToolDescriptor "${descriptor.canonicalName || 'unknown'}" is missing required field: ${field}`) + } + } + + // Validate lane + const validLanes: ToolLane[] = [ + 'desktop', + 'browser_dom', + 'browser_cdp', + 'coding', + 'pty', + 'display', + 'accessibility', + 'task_memory', + 'vscode', + 'workflow', + 'internal', + ] + if (!validLanes.includes(descriptor.lane)) { + throw new Error(`ToolDescriptor "${descriptor.canonicalName}" has invalid lane: ${descriptor.lane}`) + } + + // Validate kind + const validKinds: ToolKind[] = ['read', 'write', 'control', 'workflow', 'memory', 'internal'] + if (!validKinds.includes(descriptor.kind)) { + throw new Error(`ToolDescriptor "${descriptor.canonicalName}" has invalid kind: ${descriptor.kind}`) + } +} diff --git a/services/computer-use-mcp/src/server/tool-descriptors/vscode.ts b/services/computer-use-mcp/src/server/tool-descriptors/vscode.ts new file mode 100644 index 0000000000..4556d4fe71 --- /dev/null +++ b/services/computer-use-mcp/src/server/tool-descriptors/vscode.ts @@ -0,0 +1,73 @@ +/** + * VS Code Tool Descriptors + */ + +import type { ToolDescriptor } from './types' + +export const vscodeDescriptors: ToolDescriptor[] = [ + { + canonicalName: 'vscode_resolve_code_cli', + displayName: 'VS Code Resolve CLI', + summary: 'Detect the active VS Code CLI binary (code, code-insiders, or cursor) available in PATH.', + lane: 'vscode', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + 
canonicalName: 'vscode_open_workspace', + displayName: 'VS Code Open Workspace', + summary: 'Open a folder in VS Code using the CLI. Supports reusing or creating new windows.', + lane: 'vscode', + kind: 'control', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'vscode_open_file', + displayName: 'VS Code Open File', + summary: 'Open a file in VS Code at a specific line and column using the --goto option.', + lane: 'vscode', + kind: 'control', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'vscode_run_task', + displayName: 'VS Code Run Task', + summary: 'Run a shell command in VS Code integrated terminal. Returns the command output.', + lane: 'vscode', + kind: 'write', + readOnly: false, + destructive: false, + concurrencySafe: false, + requiresApprovalByDefault: true, + public: true, + defaultDeferred: true, + }, + { + canonicalName: 'vscode_list_problems', + displayName: 'VS Code List Problems', + summary: 'Parse TypeScript diagnostics by running typecheck and extracting errors. 
Returns structured problem objects.', + lane: 'vscode', + kind: 'read', + readOnly: true, + destructive: false, + concurrencySafe: true, + requiresApprovalByDefault: false, + public: true, + defaultDeferred: true, + }, +] diff --git a/services/computer-use-mcp/src/snap-resolver.test.ts b/services/computer-use-mcp/src/snap-resolver.test.ts new file mode 100644 index 0000000000..448642e365 --- /dev/null +++ b/services/computer-use-mcp/src/snap-resolver.test.ts @@ -0,0 +1,264 @@ +import type { DesktopGroundingSnapshot, DesktopTargetCandidate } from './desktop-grounding-types' + +import { describe, expect, it } from 'vitest' + +import { + boundsArea, + boundsCenter, + boundsIoU, + distanceToBounds, + isPointInBounds, + isStaleCandidateSource, + pointDistance, + resolveSnap, + resolveSnapByCandidate, +} from './snap-resolver' + +// --------------------------------------------------------------------------- +// Helper: minimal snapshot factory +// --------------------------------------------------------------------------- + +function makeSnapshot( + candidates: Partial[], + staleFlags?: Partial, +): DesktopGroundingSnapshot { + return { + snapshotId: 'test_1', + capturedAt: new Date().toISOString(), + foregroundApp: 'Google Chrome', + windows: [], + screenshot: { dataBase64: '', mimeType: 'image/png', path: '' }, + targetCandidates: candidates.map((c, i) => ({ + id: c.id ?? `t_${i}`, + source: c.source ?? 'ax', + appName: c.appName ?? 'Google Chrome', + role: c.role ?? 'AXButton', + label: c.label ?? `Button ${i}`, + bounds: c.bounds ?? { x: 100, y: 100, width: 50, height: 30 }, + confidence: c.confidence ?? 0.8, + interactable: c.interactable ?? 
true, + })), + staleFlags: { + screenshot: false, + ax: false, + chromeSemantic: false, + ...staleFlags, + }, + } as DesktopGroundingSnapshot +} + +// --------------------------------------------------------------------------- +// Geometry helpers +// --------------------------------------------------------------------------- + +describe('geometry helpers', () => { + it('isPointInBounds: point inside', () => { + expect(isPointInBounds({ x: 125, y: 115 }, { x: 100, y: 100, width: 50, height: 30 })).toBe(true) + }) + + it('isPointInBounds: point on edge', () => { + expect(isPointInBounds({ x: 100, y: 100 }, { x: 100, y: 100, width: 50, height: 30 })).toBe(true) + expect(isPointInBounds({ x: 150, y: 130 }, { x: 100, y: 100, width: 50, height: 30 })).toBe(true) + }) + + it('isPointInBounds: point outside', () => { + expect(isPointInBounds({ x: 99, y: 115 }, { x: 100, y: 100, width: 50, height: 30 })).toBe(false) + expect(isPointInBounds({ x: 151, y: 115 }, { x: 100, y: 100, width: 50, height: 30 })).toBe(false) + }) + + it('boundsCenter computes center', () => { + expect(boundsCenter({ x: 100, y: 200, width: 50, height: 30 })).toEqual({ x: 125, y: 215 }) + }) + + it('boundsArea computes area', () => { + expect(boundsArea({ x: 0, y: 0, width: 10, height: 20 })).toBe(200) + }) + + it('pointDistance computes euclidean distance', () => { + expect(pointDistance({ x: 0, y: 0 }, { x: 3, y: 4 })).toBe(5) + }) + + it('distanceToBounds: inside → 0', () => { + expect(distanceToBounds({ x: 125, y: 115 }, { x: 100, y: 100, width: 50, height: 30 })).toBe(0) + }) + + it('distanceToBounds: outside → positive', () => { + // 10px to the left of bounds + expect(distanceToBounds({ x: 90, y: 115 }, { x: 100, y: 100, width: 50, height: 30 })).toBe(10) + }) + + it('boundsIoU: identical → 1', () => { + const b = { x: 0, y: 0, width: 100, height: 100 } + expect(boundsIoU(b, b)).toBe(1) + }) + + it('boundsIoU: no overlap → 0', () => { + expect(boundsIoU( + { x: 0, y: 0, width: 50, height: 50 }, 
+ { x: 200, y: 200, width: 50, height: 50 }, + )).toBe(0) + }) + + it('boundsIoU: partial overlap', () => { + const iou = boundsIoU( + { x: 0, y: 0, width: 100, height: 100 }, + { x: 50, y: 50, width: 100, height: 100 }, + ) + // Intersection: 50x50 = 2500, Union: 10000 + 10000 - 2500 = 17500 + expect(iou).toBeCloseTo(2500 / 17500, 3) + }) +}) + +// --------------------------------------------------------------------------- +// resolveSnap — priority and matching +// --------------------------------------------------------------------------- + +describe('resolveSnap', () => { + it('empty candidates → raw point fallback', () => { + const snap = resolveSnap({ x: 100, y: 100 }, makeSnapshot([])) + expect(snap.source).toBe('none') + expect(snap.snappedPoint).toEqual({ x: 100, y: 100 }) + expect(snap.reason).toContain('no candidates') + }) + + it('point inside ax candidate → snaps to center', () => { + const snap = resolveSnap( + { x: 110, y: 110 }, + makeSnapshot([{ + source: 'ax', + bounds: { x: 100, y: 100, width: 50, height: 30 }, + label: 'OK Button', + }]), + ) + expect(snap.source).toBe('ax') + expect(snap.candidateId).toBe('t_0') + expect(snap.snappedPoint).toEqual({ x: 125, y: 115 }) + expect(snap.reason).toContain('OK Button') + }) + + it('chrome_dom beats ax when both contain point', () => { + const snap = resolveSnap( + { x: 110, y: 110 }, + makeSnapshot([ + { source: 'ax', bounds: { x: 100, y: 100, width: 50, height: 30 }, label: 'AX' }, + { source: 'chrome_dom', bounds: { x: 105, y: 105, width: 40, height: 20 }, label: 'Chrome' }, + ]), + ) + expect(snap.source).toBe('chrome_dom') + expect(snap.candidateId).toBe('t_1') + expect(snap.reason).toContain('Chrome') + }) + + it('prefers smallest containing candidate within same tier', () => { + const snap = resolveSnap( + { x: 120, y: 115 }, + makeSnapshot([ + { source: 'ax', bounds: { x: 50, y: 50, width: 200, height: 200 }, label: 'Big' }, + { source: 'ax', bounds: { x: 110, y: 110, width: 30, height: 20 }, 
label: 'Small' }, + ]), + ) + expect(snap.candidateId).toBe('t_1') + expect(snap.reason).toContain('Small') + }) + + it('proximity fallback: near but not inside', () => { + const snap = resolveSnap( + { x: 155, y: 115 }, + makeSnapshot([{ + source: 'ax', + bounds: { x: 100, y: 100, width: 50, height: 30 }, + label: 'Near', + }]), + ) + // 155 is 5px to the right of bounds edge (150) + expect(snap.source).toBe('ax') + expect(snap.candidateId).toBe('t_0') + expect(snap.reason).toContain('within') + }) + + it('too far from any candidate → raw fallback', () => { + const snap = resolveSnap( + { x: 500, y: 500 }, + makeSnapshot([{ + source: 'ax', + bounds: { x: 100, y: 100, width: 50, height: 30 }, + label: 'Far Away', + }]), + ) + expect(snap.source).toBe('none') + expect(snap.snappedPoint).toEqual({ x: 500, y: 500 }) + }) + + it('non-interactable candidates are skipped', () => { + const snap = resolveSnap( + { x: 110, y: 110 }, + makeSnapshot([{ + source: 'ax', + bounds: { x: 100, y: 100, width: 50, height: 30 }, + label: 'Disabled', + interactable: false, + }]), + ) + expect(snap.source).toBe('none') + }) +}) + +// --------------------------------------------------------------------------- +// resolveSnapByCandidate +// --------------------------------------------------------------------------- + +describe('resolveSnapByCandidate', () => { + it('valid candidate → snaps to center', () => { + const snap = resolveSnapByCandidate( + 't_0', + makeSnapshot([{ + bounds: { x: 100, y: 100, width: 50, height: 30 }, + label: 'My Button', + }]), + ) + expect(snap.candidateId).toBe('t_0') + expect(snap.snappedPoint).toEqual({ x: 125, y: 115 }) + expect(snap.source).toBe('ax') + expect(snap.reason).toContain('My Button') + }) + + it('missing candidate → error result', () => { + const snap = resolveSnapByCandidate('t_99', makeSnapshot([])) + expect(snap.source).toBe('none') + expect(snap.reason).toContain('not found') + }) + + it('stale candidate source → warning in reason', () => { 
+ const snap = resolveSnapByCandidate( + 't_0', + makeSnapshot( + [{ source: 'chrome_dom', label: 'Stale' }], + { chromeSemantic: true }, + ), + ) + expect(snap.reason).toContain('stale') + }) +}) + +// --------------------------------------------------------------------------- +// isStaleCandidateSource +// --------------------------------------------------------------------------- + +describe('isStaleCandidateSource', () => { + const freshSnapshot = makeSnapshot([]) + const staleSnapshot = makeSnapshot([], { chromeSemantic: true, ax: true }) + + it('chrome_dom → checks chromeSemantic flag', () => { + expect(isStaleCandidateSource('chrome_dom', freshSnapshot)).toBe(false) + expect(isStaleCandidateSource('chrome_dom', staleSnapshot)).toBe(true) + }) + + it('ax → checks ax flag', () => { + expect(isStaleCandidateSource('ax', freshSnapshot)).toBe(false) + expect(isStaleCandidateSource('ax', staleSnapshot)).toBe(true) + }) + + it('raw → never stale', () => { + expect(isStaleCandidateSource('raw', staleSnapshot)).toBe(false) + }) +}) diff --git a/services/computer-use-mcp/src/snap-resolver.ts b/services/computer-use-mcp/src/snap-resolver.ts new file mode 100644 index 0000000000..231f9a4295 --- /dev/null +++ b/services/computer-use-mcp/src/snap-resolver.ts @@ -0,0 +1,287 @@ +/** + * Snap resolver — coordinate snap logic for desktop grounding. + * + * Resolves raw coordinates to the best matching target candidate + * using the priority hierarchy: `chrome_dom > ax > vision > raw`. + * + * The resolver: + * 1. Groups candidates by source tier + * 2. Tries each tier in priority order + * 3. Within each tier, finds the closest candidate whose bounds contain the point + * 4. Falls through to the next tier if no match + * 5. 
/* Fallback: the raw point is returned when nothing matches. */

import type {
  DesktopGroundingSnapshot,
  DesktopTargetCandidate,
  SnapResolution,
  TargetSource,
} from './desktop-grounding-types'
import type { Bounds } from './types'

import { TARGET_SOURCE_PRIORITY } from './desktop-grounding-types'

/**
 * Default snap radius, in logical pixels, measured from the point to the
 * nearest edge of a candidate's bounds. A point farther than this from every
 * candidate is left unsnapped.
 */
const SNAP_PROXIMITY_THRESHOLD_PX = 20

/**
 * Tuning knobs accepted by `resolveSnap`.
 */
export interface SnapResolverOptions {
  /** Replaces the default proximity threshold (20px) used for edge-snapping */
  proximityThresholdPx?: number
  /** Restricts matching to candidates coming from these sources */
  allowedSources?: TargetSource[]
}

/**
 * Map a raw point onto the best matching target candidate.
 *
 * Source tiers are visited in `TARGET_SOURCE_PRIORITY` order
 * (documented as chrome_dom > ax > vision > raw). For each allowed tier,
 * only interactable candidates are considered:
 *
 * 1. If any candidate's bounds contain the point, the one with the smallest
 *    area wins (the first one on ties).
 * 2. Otherwise the candidate whose bounds edge is nearest to the point wins,
 *    provided that distance is within the proximity threshold.
 *
 * The first tier that produces a match decides the result, so a proximity
 * match in a higher tier takes precedence over containment in a lower one.
 * The snapped point is always the center of the winning candidate's bounds.
 *
 * @param point - The raw coordinate to resolve
 * @param snapshot - The current desktop grounding snapshot
 * @param options - Optional resolution parameters
 * @returns The snap resolution; `source` is `'none'` and the point is
 *          returned unchanged when nothing matched
 */
export function resolveSnap(
  point: { x: number, y: number },
  snapshot: DesktopGroundingSnapshot,
  options: SnapResolverOptions = {},
): SnapResolution {
  const threshold = options.proximityThresholdPx ?? SNAP_PROXIMITY_THRESHOLD_PX
  const allowedSources = options.allowedSources ?? [...TARGET_SOURCE_PRIORITY]
  const { targetCandidates } = snapshot

  // Result used whenever the point stays where it is.
  const rawFallback = (reason: string): SnapResolution => ({
    rawPoint: point,
    snappedPoint: point,
    source: 'none',
    reason,
  })

  // Result used whenever the point is pulled to a candidate's center.
  const snapTo = (
    target: DesktopTargetCandidate,
    source: SnapResolution['source'],
    reason: string,
  ): SnapResolution => ({
    rawPoint: point,
    snappedPoint: boundsCenter(target.bounds),
    candidateId: target.id,
    source,
    reason,
  })

  if (targetCandidates.length === 0)
    return rawFallback('no candidates available; using raw point')

  for (const source of TARGET_SOURCE_PRIORITY) {
    if (!allowedSources.includes(source))
      continue

    const tier = targetCandidates.filter(c => c.source === source && c.interactable)
    if (tier.length === 0)
      continue

    // Containment first: the most specific (smallest) enclosing candidate.
    const containing = tier.filter(c => isPointInBounds(point, c.bounds))
    if (containing.length > 0) {
      const smallest = containing.reduce((kept, next) =>
        boundsArea(kept.bounds) <= boundsArea(next.bounds) ? kept : next,
      )
      return snapTo(
        smallest,
        source,
        `point inside ${source} candidate "${smallest.label}" bounds; snapped to center`,
      )
    }

    // Then proximity: nearest bounds edge within the threshold.
    const nearest = findNearestCandidate(point, tier, threshold)
    if (nearest) {
      return snapTo(
        nearest,
        source,
        `point within ${threshold}px of ${source} candidate "${nearest.label}"; snapped to center`,
      )
    }
  }

  return rawFallback(`no candidate matched within ${threshold}px; using raw point`)
}

/**
 * Resolve a snap by candidate ID directly (for `desktop_click_target`).
 *
 * Looks up the candidate by id and returns a snap to its center; a stale
 * source is reported in the reason rather than rejected.
 */
/**
 * Resolve a snap for a specific candidate id (for `desktop_click_target`).
 *
 * Looks the candidate up in the snapshot and snaps to the center of its
 * bounds. A stale source does not block the snap: the last-known position is
 * still returned and the reason starts with `WARNING:`.
 *
 * When the id is unknown the result has `source: 'none'`, no `candidateId`,
 * and both points set to `(0, 0)`. Those coordinates are placeholders, so
 * callers should check `source` before acting on them.
 *
 * @param candidateId - The candidate id from the snapshot
 * @param snapshot - The current desktop grounding snapshot
 * @returns The snap resolution, or a `source: 'none'` result if the id is unknown
 */
export function resolveSnapByCandidate(
  candidateId: string,
  snapshot: DesktopGroundingSnapshot,
): SnapResolution {
  const candidate = snapshot.targetCandidates.find(c => c.id === candidateId)

  if (!candidate) {
    return {
      rawPoint: { x: 0, y: 0 },
      snappedPoint: { x: 0, y: 0 },
      source: 'none',
      reason: `candidate "${candidateId}" not found in snapshot`,
    }
  }

  // Stale and fresh candidates resolve to the same point; only the reason differs.
  const center = boundsCenter(candidate.bounds)
  const reason = isStaleCandidateSource(candidate.source, snapshot)
    ? `WARNING: candidate "${candidateId}" source "${candidate.source}" is stale; proceeding with last-known position`
    : `direct candidate lookup; snapped to ${candidate.source} candidate "${candidate.label}" center`

  return {
    rawPoint: center,
    snappedPoint: center,
    candidateId,
    source: candidate.source,
    reason,
  }
}

/**
 * Check if a candidate's source is flagged as stale in the snapshot.
 *
 * `chrome_dom` reads the `chromeSemantic` flag, `ax` reads `ax`, `vision`
 * reads `screenshot`, and `raw` is never stale.
 *
 * @param source - The candidate source to check
 * @param snapshot - The snapshot whose `staleFlags` are consulted
 * @returns `true` when the flag backing `source` is set
 */
export function isStaleCandidateSource(
  source: TargetSource,
  snapshot: DesktopGroundingSnapshot,
): boolean {
  switch (source) {
    case 'chrome_dom':
      return snapshot.staleFlags.chromeSemantic
    case 'ax':
      return snapshot.staleFlags.ax
    case 'vision':
      return snapshot.staleFlags.screenshot
    case 'raw':
      return false
    default: {
      // Compile-time exhaustiveness guard: adding a TargetSource member
      // without a case above becomes a type error here.
      const unhandled: never = source
      void unhandled
      // Runtime guard: a value outside the union has no stale flag backing it,
      // so report "not stale" instead of falling through to `undefined`.
      return false
    }
  }
}

// ---------------------------------------------------------------------------
// Geometry helpers
// ---------------------------------------------------------------------------

/** Check if a point is inside a bounding rect; all four edges count as inside. */
export function isPointInBounds(
  point: { x: number, y: number },
  bounds: Bounds,
): boolean {
  const withinX = point.x >= bounds.x && point.x <= bounds.x + bounds.width
  const withinY = point.y >= bounds.y && point.y <= bounds.y + bounds.height
  return withinX && withinY
}

/** Compute the center point of a bounding rect, rounded to whole pixels. */
export function boundsCenter(bounds: Bounds): { x: number, y: number } {
  return {
    x: Math.round(bounds.x + bounds.width / 2),
    y: Math.round(bounds.y + bounds.height / 2),
  }
}

/** Compute the area of a bounding rect. */
export function boundsArea(bounds: Bounds): number {
  return bounds.width * bounds.height
}

/** Euclidean distance between two points. */
export function pointDistance(
  a: { x: number, y: number },
  b: { x: number, y: number },
): number {
  return Math.sqrt((a.x - b.x) ** 2 + (a.y - b.y) ** 2)
}

/**
 * Compute the shortest distance from a point to a bounding rect.
 * Returns 0 if the point is inside the bounds or on an edge.
 */
export function distanceToBounds(
  point: { x: number, y: number },
  bounds: Bounds,
): number {
  // Per-axis overshoot past the rect; zero when the point lies within that axis range.
  const dx = Math.max(bounds.x - point.x, 0, point.x - (bounds.x + bounds.width))
  const dy = Math.max(bounds.y - point.y, 0, point.y - (bounds.y + bounds.height))
  return Math.sqrt(dx * dx + dy * dy)
}

/**
 * Compute the intersection-over-union (IoU) between two bounding rects.
 * Used for deduplication of candidates from different sources.
 */
+ */ +export function boundsIoU(a: Bounds, b: Bounds): number { + const ax2 = a.x + a.width + const ay2 = a.y + a.height + const bx2 = b.x + b.width + const by2 = b.y + b.height + + const interX = Math.max(0, Math.min(ax2, bx2) - Math.max(a.x, b.x)) + const interY = Math.max(0, Math.min(ay2, by2) - Math.max(a.y, b.y)) + const interArea = interX * interY + + if (interArea === 0) + return 0 + + const aArea = a.width * a.height + const bArea = b.width * b.height + return interArea / (aArea + bArea - interArea) +} + +// --------------------------------------------------------------------------- +// Internal helpers +// --------------------------------------------------------------------------- + +/** + * Find the nearest candidate to a point within a distance threshold. + * Uses distance-to-bounds-edge as metric, not center distance. + */ +function findNearestCandidate( + point: { x: number, y: number }, + candidates: DesktopTargetCandidate[], + threshold: number, +): DesktopTargetCandidate | undefined { + let best: DesktopTargetCandidate | undefined + let bestDist = Infinity + + for (const candidate of candidates) { + const dist = distanceToBounds(point, candidate.bounds) + if (dist <= threshold && dist < bestDist) { + best = candidate + bestDist = dist + } + } + + return best +} diff --git a/services/computer-use-mcp/src/state.ts b/services/computer-use-mcp/src/state.ts index d084724fcd..25880713f6 100644 --- a/services/computer-use-mcp/src/state.ts +++ b/services/computer-use-mcp/src/state.ts @@ -177,6 +177,14 @@ export interface RunState { /** High-level task execution state (goal, facts, blockers, next step). */ taskMemory?: TaskMemory + // --- Desktop grounding ------------------------------------------------- + /** Latest unified desktop grounding snapshot captured by `desktop_observe`. */ + lastGroundingSnapshot?: import('./desktop-grounding-types').DesktopGroundingSnapshot + /** Most recent pointer snap intent for overlay rendering. 
*/ + lastPointerIntent?: import('./desktop-grounding-types').PointerIntent + /** Candidate id of the last `desktop_click_target` call for duplicate protection. */ + lastClickedCandidateId?: string + // --- Meta ------------------------------------------------------------- /** ISO timestamp of the last state update. */ updatedAt: string @@ -388,6 +396,37 @@ export class RunStateManager { this.touch() } + // -- Desktop grounding updates ----------------------------------------- + + /** + * Store a fresh desktop grounding snapshot from `desktop_observe`. + * A new observe invalidates the duplicate-click guard. + */ + updateGroundingSnapshot(snapshot: import('./desktop-grounding-types').DesktopGroundingSnapshot): void { + this.state.lastGroundingSnapshot = snapshot + this.state.lastClickedCandidateId = undefined + this.touch() + } + + /** + * Store the last pointer snap intent and clicked candidate id. + */ + updatePointerIntent(intent: import('./desktop-grounding-types').PointerIntent, candidateId: string): void { + this.state.lastPointerIntent = intent + this.state.lastClickedCandidateId = candidateId + this.touch() + } + + /** + * Clear desktop grounding state when the snapshot becomes invalid. + */ + clearGroundingState(): void { + this.state.lastGroundingSnapshot = undefined + this.state.lastPointerIntent = undefined + this.state.lastClickedCandidateId = undefined + this.touch() + } + clearTask() { this.state.activeTask = undefined this.touch() diff --git a/services/computer-use-mcp/src/strategy.ts b/services/computer-use-mcp/src/strategy.ts index 00b1c10162..32da44d99e 100644 --- a/services/computer-use-mcp/src/strategy.ts +++ b/services/computer-use-mcp/src/strategy.ts @@ -37,6 +37,10 @@ export type AdvisoryKind | 'use_browser_surface' | 'use_pty_surface' | 'enumerate_displays_first' + // Desktop grounding advisories + | 'click_likely_duplicate' + | 'observe_first_required' + | 'grounding_stale' /** Broad category for classifying advisories. 
*/ export type AdvisoryCategory = 'prep' | 'reroute' | 'recovery' | 'informational' @@ -93,6 +97,11 @@ export const ADVISORY_CATEGORY_MAP: Record = { wait_and_retry: 'recovery', abort_task: 'recovery', approval_rejected_replan: 'recovery', + click_likely_duplicate: 'recovery', + + // Desktop grounding + observe_first_required: 'prep', + grounding_stale: 'prep', // Informational: no action needed, safe to proceed proceed: 'informational', @@ -114,6 +123,11 @@ export const ADVISORY_SURFACE_MAP: Record = { wait_and_retry: 'none', abort_task: 'none', approval_rejected_replan: 'none', + click_likely_duplicate: 'desktop', + + // Desktop grounding + observe_first_required: 'desktop', + grounding_stale: 'desktop', proceed: 'none', } @@ -372,6 +386,54 @@ export function evaluateStrategy(params: { })) } + // ----------------------------------------------------------------------- + // Rule 12: desktop_click_target requires a fresh grounding snapshot. + // ----------------------------------------------------------------------- + if ( + proposedAction.kind === 'desktop_click_target' + && !state.lastGroundingSnapshot + ) { + advisories.push(advisory({ + kind: 'observe_first_required', + reason: 'No desktop grounding snapshot available. Call desktop_observe first to discover interactable targets.', + suggestedToolName: 'desktop_observe', + })) + } + + // ----------------------------------------------------------------------- + // Rule 13: stale grounding snapshots must be refreshed before clicking. + // ----------------------------------------------------------------------- + if ( + proposedAction.kind === 'desktop_click_target' + && state.lastGroundingSnapshot + ) { + const snapshotAge = Date.now() - new Date(state.lastGroundingSnapshot.capturedAt).getTime() + if (snapshotAge > 5000) { + advisories.push(advisory({ + kind: 'grounding_stale', + reason: `Desktop grounding snapshot is ${Math.round(snapshotAge / 1000)}s old. 
Refresh with desktop_observe before clicking.`, + suggestedToolName: 'desktop_observe', + })) + } + } + + // ----------------------------------------------------------------------- + // Rule 14: repeated target clicks need a fresh observe in between. + // ----------------------------------------------------------------------- + if ( + proposedAction.kind === 'desktop_click_target' + && state.lastGroundingSnapshot + && state.lastClickedCandidateId + && 'candidateId' in proposedAction.input + && proposedAction.input.candidateId === state.lastClickedCandidateId + ) { + advisories.push(advisory({ + kind: 'click_likely_duplicate', + reason: `Candidate "${state.lastClickedCandidateId}" was already clicked. Call desktop_observe to verify the UI changed before clicking again.`, + suggestedToolName: 'desktop_observe', + })) + } + // If no advisories were emitted, it is safe to proceed. if (advisories.length === 0) { advisories.push(advisory({ diff --git a/services/computer-use-mcp/src/transparency.ts b/services/computer-use-mcp/src/transparency.ts index dc35fe9210..f178e4dfec 100644 --- a/services/computer-use-mcp/src/transparency.ts +++ b/services/computer-use-mcp/src/transparency.ts @@ -36,6 +36,10 @@ export function explainActionIntent(action: ActionInvocation, runState: RunState return `Taking a screenshot to observe the current state of the desktop${taskContext}.` case 'observe_windows': return `Listing visible windows to understand what applications are open${taskContext}.` + case 'desktop_observe': + return `Capturing unified desktop observation (screenshot + AX tree + Chrome semantics)${taskContext}.` + case 'desktop_click_target': + return `Clicking target candidate "${action.input.candidateId}" using snap-resolved coordinates${taskContext}.` case 'open_app': return `Opening "${action.input.app}" because the task requires this application${taskContext}.` case 'focus_app': @@ -137,6 +141,10 @@ export function explainActionOutcome(params: { return 'Screenshot 
captured successfully. The model can now analyze the current desktop state.' case 'observe_windows': return 'Window list retrieved. The model can now understand which applications are running.' + case 'desktop_observe': + return 'Desktop observation captured successfully. Target candidates have been identified.' + case 'desktop_click_target': + return `Clicked target candidate "${action.input.candidateId}" at snap-resolved coordinates.` case 'open_app': return `"${action.input.app}" has been opened. It should now be available for interaction.` case 'focus_app': @@ -181,6 +189,7 @@ function buildFailureExplanation( case 'type_text': case 'press_keys': case 'scroll': + case 'desktop_click_target': parts.push('Consider taking a screenshot to verify the current UI state before retrying.') break case 'terminal_exec': @@ -354,6 +363,27 @@ export function summarizeRunState(state: RunState): string { parts.push(`Last approval was REJECTED${state.lastRejectionReason ? ` (${state.lastRejectionReason})` : ''}`) } + // Desktop grounding + if (state.lastGroundingSnapshot) { + const grounding = state.lastGroundingSnapshot + const staleMarks: string[] = [] + if (grounding.staleFlags.screenshot) + staleMarks.push('screenshot') + if (grounding.staleFlags.ax) + staleMarks.push('AX') + if (grounding.staleFlags.chromeSemantic) + staleMarks.push('Chrome') + const staleNote = staleMarks.length > 0 ? ` [stale: ${staleMarks.join(', ')}]` : '' + parts.push(`Grounding: ${grounding.snapshotId} (${grounding.targetCandidates.length} candidates)${staleNote}`) + if (grounding.chromeSemanticSnapshot) { + parts.push(` Chrome page: ${grounding.chromeSemanticSnapshot.pageTitle}`) + } + } + if (state.lastPointerIntent) { + const pointer = state.lastPointerIntent + parts.push(`Last pointer: ${pointer.source} → (${pointer.snappedPoint.x}, ${pointer.snappedPoint.y}) candidate=${pointer.candidateId ?? 
'none'} conf=${pointer.confidence.toFixed(2)}`) + } + // Task if (state.activeTask) { parts.push(summarizeTaskProgress(state.activeTask)) diff --git a/services/computer-use-mcp/src/types.ts b/services/computer-use-mcp/src/types.ts index e55791f329..c7dfbffdd7 100644 --- a/services/computer-use-mcp/src/types.ts +++ b/services/computer-use-mcp/src/types.ts @@ -7,6 +7,8 @@ export type MouseButton = 'left' | 'right' | 'middle' export type ActionKind = | 'screenshot' | 'observe_windows' + | 'desktop_observe' + | 'desktop_click_target' | 'open_app' | 'focus_app' | 'secret_read_env_value' @@ -223,6 +225,16 @@ export interface ClipboardWriteTextActionInput { text: string } +export interface DesktopObserveInput { + includeChrome?: boolean +} + +export interface DesktopClickTargetInput { + candidateId: string + clickCount?: number + button?: MouseButton +} + export interface ScreenshotRequest { label?: string } @@ -309,6 +321,8 @@ export interface TestTargetLaunchResult { export type ActionInvocation = | { kind: 'screenshot', input: ScreenshotRequest } | { kind: 'observe_windows', input: ObserveWindowsRequest } + | { kind: 'desktop_observe', input: DesktopObserveInput } + | { kind: 'desktop_click_target', input: DesktopClickTargetInput } | { kind: 'open_app', input: OpenAppActionInput } | { kind: 'focus_app', input: FocusAppActionInput } | { kind: 'secret_read_env_value', input: SecretReadEnvValueActionInput } @@ -482,6 +496,7 @@ export interface BrowserDomInteractiveElement { value?: string href?: string placeholder?: string + role?: string disabled?: boolean checked?: boolean visible?: boolean @@ -501,6 +516,12 @@ export interface BrowserDomFrameDom { url?: string title?: string bodyText?: string + frameRect?: { + x: number + y: number + w: number + h: number + } interactiveElements?: BrowserDomInteractiveElement[] }