diff --git a/apps/stage-tamagotchi/electron.vite.config.ts b/apps/stage-tamagotchi/electron.vite.config.ts index 959b3c14c8..53f11d3027 100644 --- a/apps/stage-tamagotchi/electron.vite.config.ts +++ b/apps/stage-tamagotchi/electron.vite.config.ts @@ -133,6 +133,11 @@ export default defineConfig({ alias: { '@proj-airi/server-sdk': resolve(join(import.meta.dirname, '..', '..', 'packages', 'server-sdk', 'src')), '@proj-airi/i18n': resolve(join(import.meta.dirname, '..', '..', 'packages', 'i18n', 'src')), + // NOTICE: the @proj-airi/stage-ui alias resolves to a directory; rolldown + // concatenates sub-paths without a file extension, so bare .ts files at the + // stores/ root (e.g. mcp-tool-bridge.ts) are not found. Add explicit aliases + // for each such file that the renderer imports from @proj-airi/stage-ui. + '@proj-airi/stage-ui/stores/mcp-tool-bridge': resolve(join(import.meta.dirname, '..', '..', 'packages', 'stage-ui', 'src', 'stores', 'mcp-tool-bridge.ts')), '@proj-airi/stage-ui': resolve(join(import.meta.dirname, '..', '..', 'packages', 'stage-ui', 'src')), '@proj-airi/stage-pages': resolve(join(import.meta.dirname, '..', '..', 'packages', 'stage-pages', 'src')), '@proj-airi/stage-shared': resolve(join(import.meta.dirname, '..', '..', 'packages', 'stage-shared', 'src')), diff --git a/apps/stage-tamagotchi/src/main/index.ts b/apps/stage-tamagotchi/src/main/index.ts index 585fa2f922..acf3e30c1d 100644 --- a/apps/stage-tamagotchi/src/main/index.ts +++ b/apps/stage-tamagotchi/src/main/index.ts @@ -207,7 +207,7 @@ app.whenReady().then(async () => { // provider depends on 'windows:desktop-overlay'. injeca.invoke({ dependsOn: { desktopOverlay }, - callback: noop, + callback: () => {}, }) } diff --git a/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-polling.test.ts b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-polling.test.ts index 9b3cda1e72..b2acaf8939 100644 --- a/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-polling.test.ts +++ b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-polling.test.ts @@ -1,5 +1,4 @@ -import type { McpCallToolResult } from '@proj-airi/stage-ui/tools/mcp' - +import type { ElectronMcpCallToolResult } from '../../shared/eventa' import type { OverlayState } from './desktop-overlay-polling' import { afterEach, describe, expect, it, vi } from 'vitest' @@ -162,7 +161,7 @@ describe('createOverlayPollController', () => { it('calls tool and delivers state on successful poll', async () => { vi.useFakeTimers() - const mockResult: McpCallToolResult = { + const mockResult: ElectronMcpCallToolResult = { structuredContent: { runState: { lastGroundingSnapshot: { @@ -176,7 +175,7 @@ describe('createOverlayPollController', () => { }, } - const callTool = vi.fn<(name: string) => Promise>() + const callTool = vi.fn<(name: string) => Promise>() .mockResolvedValue(mockResult) const received: OverlayState[] = [] @@ -201,32 +200,10 @@ describe('createOverlayPollController', () => { controller.stop() }) - it('clears the per-call timeout when the tool resolves before the timeout fires', async () => { - vi.useFakeTimers() - - const callTool = vi.fn<(name: string) => Promise>() - .mockResolvedValue({ structuredContent: {} }) - - const controller = createOverlayPollController({ - callTool, - onState: () => {}, - intervalMs: 100, - callTimeoutMs: 500, - }) - - controller.start() - await vi.advanceTimersByTimeAsync(0) - - // Only the next poll should remain scheduled. The per-call timeout must be cleared. - expect(vi.getTimerCount()).toBe(1) - - controller.stop() - }) - it('stops polling after stop() is called', async () => { vi.useFakeTimers() - const callTool = vi.fn<(name: string) => Promise>() + const callTool = vi.fn<(name: string) => Promise>() .mockResolvedValue({ structuredContent: {} }) const controller = createOverlayPollController({ @@ -250,7 +227,7 @@ describe('createOverlayPollController', () => { it('continues polling after a single failure', async () => { vi.useFakeTimers() - const callTool = vi.fn<(name: string) => Promise>() + const callTool = vi.fn<(name: string) => Promise>() .mockRejectedValueOnce(new Error('MCP down')) .mockResolvedValue({ structuredContent: { @@ -292,7 +269,7 @@ describe('createOverlayPollController', () => { it('is a no-op to call start() twice', async () => { vi.useFakeTimers() - const callTool = vi.fn<(name: string) => Promise>() + const callTool = vi.fn<(name: string) => Promise>() .mockResolvedValue({ structuredContent: {} }) const controller = createOverlayPollController({ @@ -313,8 +290,9 @@ describe('createOverlayPollController', () => { it('recovers from a hanging callTool via per-call timeout', async () => { vi.useFakeTimers() - const callTool = vi.fn<(name: string) => Promise>() - .mockImplementationOnce(() => new Promise(() => {})) + // First call hangs forever (simulates startup race when RPC not ready) + const callTool = vi.fn<(name: string) => Promise>() + .mockImplementationOnce(() => new Promise(() => {})) // never resolves .mockResolvedValue({ structuredContent: { runState: { @@ -344,97 +322,15 @@ describe('createOverlayPollController', () => { expect(callTool).toHaveBeenCalledTimes(1) expect(received).toHaveLength(0) - // Advance past the timeout and several fallback windows. The controller - // should allow a bounded recovery retry even though the original invoke - // is still hung in the background. - await vi.advanceTimersByTimeAsync(500) - await vi.advanceTimersByTimeAsync(200) - expect(callTool).toHaveBeenCalledTimes(2) - expect(received).toHaveLength(1) - expect(received[0].snapshotId).toBe('dg_after_timeout') - - controller.stop() - }) - - it('caps outstanding timed-out polls to avoid unbounded buildup', async () => { - vi.useFakeTimers() - - const callTool = vi.fn<(name: string) => Promise>() - .mockImplementation(() => new Promise(() => {})) - - const controller = createOverlayPollController({ - callTool, - onState: () => {}, - intervalMs: 100, - fallbackIntervalMs: 200, - callTimeoutMs: 500, - }) - - controller.start() - - await vi.advanceTimersByTimeAsync(0) - expect(callTool).toHaveBeenCalledTimes(1) - - await vi.advanceTimersByTimeAsync(500) - await vi.advanceTimersByTimeAsync(200) - expect(callTool).toHaveBeenCalledTimes(2) - + // Advance past the 500ms timeout → catch triggers, schedules fallback await vi.advanceTimersByTimeAsync(500) - await vi.advanceTimersByTimeAsync(1000) - expect(callTool).toHaveBeenCalledTimes(2) - - controller.stop() - }) - - it('recovers again once a timed-out hung-call slot lease expires', async () => { - vi.useFakeTimers() - - const callTool = vi.fn<(name: string) => Promise>() - .mockImplementationOnce(() => new Promise(() => {})) - .mockImplementationOnce(() => new Promise(() => {})) - .mockResolvedValue({ - structuredContent: { - runState: { - lastGroundingSnapshot: { - snapshotId: 'dg_after_lease', - targetCandidates: [], - staleFlags: { screenshot: false, ax: false, chromeSemantic: false }, - }, - }, - }, - }) - - const received: OverlayState[] = [] - - const controller = createOverlayPollController({ - callTool, - onState: (state) => { - received.push(state) - }, - intervalMs: 100, - fallbackIntervalMs: 200, - callTimeoutMs: 500, - hungCallLeaseMs: 1000, - }) - - controller.start() - - await vi.advanceTimersByTimeAsync(0) - expect(callTool).toHaveBeenCalledTimes(1) - - await vi.advanceTimersByTimeAsync(500) - await vi.advanceTimersByTimeAsync(200) - expect(callTool).toHaveBeenCalledTimes(2) - - await vi.advanceTimersByTimeAsync(500) - await vi.advanceTimersByTimeAsync(200) - expect(callTool).toHaveBeenCalledTimes(2) expect(received).toHaveLength(0) + // Advance past the 200ms fallback interval → second poll fires and succeeds await vi.advanceTimersByTimeAsync(200) - expect(callTool).toHaveBeenCalledTimes(3) + expect(callTool).toHaveBeenCalledTimes(2) expect(received).toHaveLength(1) - expect(received[0].snapshotId).toBe('dg_after_lease') + expect(received[0].snapshotId).toBe('dg_after_timeout') controller.stop() }) diff --git a/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-polling.ts b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-polling.ts index 1ee88e8f68..aea728f835 100644 --- a/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-polling.ts +++ b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay-polling.ts @@ -5,7 +5,7 @@ * without a DOM environment or Vue test-utils. */ -import type { McpCallToolResult } from '@proj-airi/stage-ui/tools/mcp' +import type { ElectronMcpCallToolResult } from '../../shared/eventa' // --------------------------------------------------------------------------- // Types — minimal shapes matching RunState fields the overlay consumes @@ -90,7 +90,7 @@ export function extractOverlayState(runState: Record): OverlayS * Extract runState from an MCP call result. * Returns undefined if the result is an error or has no structured content. */ -export function extractRunStateFromResult(result: McpCallToolResult): Record | undefined { +export function extractRunStateFromResult(result: ElectronMcpCallToolResult): Record | undefined { if (result.isError) return undefined @@ -121,7 +121,7 @@ export interface OverlayPollController { export interface OverlayPollConfig { /** Function to call MCP tool. */ - callTool: (name: string) => Promise + callTool: (name: string) => Promise /** Callback with extracted state on each successful poll. */ onState: (state: OverlayState) => void /** Normal poll interval in ms. Default: 250. */ @@ -130,15 +130,11 @@ export interface OverlayPollConfig { fallbackIntervalMs?: number /** Per-call timeout in ms. Default: 5000. Prevents poll loop hang on startup race. */ callTimeoutMs?: number - /** How long a timed-out background call occupies a recovery slot before we probe again. */ - hungCallLeaseMs?: number } const DEFAULT_INTERVAL = 250 const DEFAULT_FALLBACK_INTERVAL = 500 const DEFAULT_CALL_TIMEOUT = 5000 -const DEFAULT_HUNG_CALL_LEASE = 5000 -const MAX_BACKGROUND_HUNG_CALLS = 2 /** * MCP server name for computer-use-mcp. Matches the key in mcp.json. @@ -152,75 +148,21 @@ export const MCP_TOOL_NAME = 'computer_use::desktop_get_state' export function createOverlayPollController(config: OverlayPollConfig): OverlayPollController { const normalInterval = config.intervalMs ?? DEFAULT_INTERVAL const fallbackInterval = config.fallbackIntervalMs ?? DEFAULT_FALLBACK_INTERVAL - const hungCallLeaseMs = config.hungCallLeaseMs ?? DEFAULT_HUNG_CALL_LEASE let timer: ReturnType | null = null let running = false - let inFlightCall: Promise | null = null - let backgroundHungSlots: Array<{ expiresAt: number }> = [] - - function scheduleNext(nextInterval: number) { - if (running) { - timer = setTimeout(poll, nextInterval) - } - } - - function pruneHungCallSlots(now: number) { - backgroundHungSlots = backgroundHungSlots.filter(slot => slot.expiresAt > now) - } async function poll() { - pruneHungCallSlots(Date.now()) - - if (inFlightCall || backgroundHungSlots.length >= MAX_BACKGROUND_HUNG_CALLS) { - scheduleNext(fallbackInterval) - return - } - let nextInterval = normalInterval - let timeoutId: ReturnType | undefined try { // NOTICE: Wrap callTool with a timeout to prevent the poll loop from // hanging forever if the eventa invoke never resolves (e.g. during // startup when the main-process RPC handlers may not be ready yet). - // NOTICE: The bridge does not expose abort semantics, so a timed-out - // call may still be pending in the background. We therefore track - // timed-out calls as expiring lease slots: the cap bounds how many - // unrecoverable invokes we tolerate at once, while lease expiry still - // lets the overlay probe again after a cooling-off window. - let timedOutSlot: { expiresAt: number } | null = null - const currentCall = config.callTool(MCP_TOOL_NAME) - inFlightCall = currentCall - currentCall.then(() => { - if (timedOutSlot) { - backgroundHungSlots = backgroundHungSlots.filter(slot => slot !== timedOutSlot) - } - else if (inFlightCall === currentCall) { - inFlightCall = null - } - }, () => { - if (timedOutSlot) { - backgroundHungSlots = backgroundHungSlots.filter(slot => slot !== timedOutSlot) - } - else if (inFlightCall === currentCall) { - inFlightCall = null - } - }) - const result = await Promise.race([ - currentCall, + config.callTool(MCP_TOOL_NAME), new Promise((_, reject) => - timeoutId = setTimeout(() => { - timedOutSlot = { - expiresAt: Date.now() + hungCallLeaseMs, - } - backgroundHungSlots = [...backgroundHungSlots, timedOutSlot] - if (inFlightCall === currentCall) { - inFlightCall = null - } - reject(new Error('callTool timeout')) - }, config.callTimeoutMs ?? DEFAULT_CALL_TIMEOUT), + setTimeout(() => reject(new Error('callTool timeout')), config.callTimeoutMs ?? DEFAULT_CALL_TIMEOUT), ), ]) const runState = extractRunStateFromResult(result) @@ -236,13 +178,10 @@ export function createOverlayPollController(config: OverlayPollConfig): OverlayP // MCP server not running, bridge disconnected, or timeout — graceful degradation nextInterval = fallbackInterval } - finally { - if (timeoutId !== undefined) { - clearTimeout(timeoutId) - } - } - scheduleNext(nextInterval) + if (running) { + timer = setTimeout(poll, nextInterval) + } } return { diff --git a/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay.vue b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay.vue index 852d4e223c..424ed1b3bb 100644 --- a/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay.vue +++ b/apps/stage-tamagotchi/src/renderer/pages/desktop-overlay.vue @@ -25,11 +25,15 @@ import { pointInOverlay, rectIntersectsOverlay, screenRectToLocal, screenToLocal import { createEmptyOverlayState, createOverlayPollController } from './desktop-overlay-polling' // --------------------------------------------------------------------------- -// Overlay window bounds +// Overlay window bounds — read once on mount from main process // --------------------------------------------------------------------------- const getWindowBounds = useElectronEventaInvoke(electron.window.getBounds) -const callMcpTool = useElectronEventaInvoke(electronMcpCallTool) +// Use Eventa invoke for MCP tool calls — McpToolBridge requires a +// setMcpToolBridge() caller that does not exist in the overlay renderer. +// electronMcpCallTool is already wired in setupDesktopOverlayElectronInvokes +// via createMcpServersService, so it works without any extra bootstrap. +const mcpCallTool = useElectronEventaInvoke(electronMcpCallTool) const overlayBounds = ref(null) // --------------------------------------------------------------------------- @@ -66,8 +70,12 @@ const matchedCandidate = computed(() => { return visibleCandidates.value.find(c => c.id === pointerIntent.value!.candidateId) ?? null }) +// --------------------------------------------------------------------------- +// Polling controller +// --------------------------------------------------------------------------- + const controller = createOverlayPollController({ - callTool: name => callMcpTool({ name }), + callTool: async name => mcpCallTool({ name }), onState: (newState) => { state.value = newState }, @@ -123,11 +131,14 @@ const targetBoxStyle = computed(() => { // Lifecycle // --------------------------------------------------------------------------- -async function syncOverlayBounds() { +onMounted(async () => { + // Read overlay window bounds from main process (one-time) try { - overlayBounds.value = await getWindowBounds() + const bounds = await getWindowBounds() + overlayBounds.value = bounds } catch { + // Fallback: assume bounds start at (0,0) with window inner size overlayBounds.value = { x: 0, y: 0, @@ -135,20 +146,11 @@ async function syncOverlayBounds() { height: window.innerHeight, } } -} - -function handleResize() { - void syncOverlayBounds() -} -onMounted(async () => { - await syncOverlayBounds() - window.addEventListener('resize', handleResize) controller.start() }) onUnmounted(() => { - window.removeEventListener('resize', handleResize) controller.stop() }) diff --git a/packages/stage-ui/package.json b/packages/stage-ui/package.json index b498e1f230..1000ba35c4 100644 --- a/packages/stage-ui/package.json +++ b/packages/stage-ui/package.json @@ -39,6 +39,7 @@ "./stores/settings/analytics": "./src/stores/settings/analytics.ts", "./stores/settings": "./src/stores/settings/index.ts", "./stores/modules/vision": "./src/stores/modules/vision/index.ts", + "./stores/mcp-tool-bridge": "./src/stores/mcp-tool-bridge.ts", "./stores/*": "./src/stores/*.ts", "./stores": "./src/stores/index.ts", "./workers/vad": "./src/workers/vad/index.ts", diff --git a/packages/stage-ui/src/stores/mcp-tool-bridge.ts b/packages/stage-ui/src/stores/mcp-tool-bridge.ts new file mode 100644 index 0000000000..c000a92b9a --- /dev/null +++ b/packages/stage-ui/src/stores/mcp-tool-bridge.ts @@ -0,0 +1,50 @@ +/** + * Minimal bridge interface for calling MCP tools from the desktop overlay + * renderer without a direct dependency on the MCP server runtime. + * + * The bridge is set by the Electron main/preload layer (or by a test stub) + * and retrieved by overlay pages that need to invoke computer-use MCP tools. + */ + +export interface McpToolDescriptor { + serverName: string + name: string + toolName: string + description?: string + inputSchema: Record +} + +export interface McpCallToolPayload { + name: string + arguments?: Record +} + +export interface McpCallToolResult { + content?: Array> + structuredContent?: Record + toolResult?: unknown + isError?: boolean +} + +interface McpToolBridge { + listTools: () => Promise + callTool: (payload: McpCallToolPayload) => Promise +} + +let bridge: McpToolBridge | undefined + +export function setMcpToolBridge(nextBridge: McpToolBridge) { + bridge = nextBridge +} + +export function clearMcpToolBridge() { + bridge = undefined +} + +export function getMcpToolBridge(): McpToolBridge { + if (!bridge) { + throw new Error('MCP tool bridge is not available in this runtime.') + } + + return bridge +} diff --git a/services/computer-use-mcp/chrome-extension/README.md b/services/computer-use-mcp/chrome-extension/README.md index dfe332750b..2bc6dfe368 100644 --- a/services/computer-use-mcp/chrome-extension/README.md +++ b/services/computer-use-mcp/chrome-extension/README.md @@ -1,21 +1,21 @@ # AIRI Desktop Grounding — Chrome Extension -Read-only Chrome DOM observation bridge for the AIRI Desktop Grounding layer. +Chrome DOM observation and interaction bridge for the AIRI Desktop Grounding layer. ## What it does - Collects interactive elements (buttons, links, inputs, etc.) from all frames in the active Chrome tab - Reports element positions, ARIA roles, text, and rect coordinates - Feeds this data into the desktop grounding snap resolver for coordinate mapping +- Performs targeted DOM interactions (set input values, check checkboxes, trigger events) when routed by the action executor ## What it does NOT do -- ❌ No DOM mutations (no clicking, typing, scrolling on DOM elements) - ❌ No `eval` / `new Function` / `chrome.scripting.executeScript` - ❌ No external network requests (no Python bridge, no offscreen documents) - ❌ No popup UI -All user interactions are performed via real macOS OS-level input events (CGEvent) through the desktop grounding executor. +Physical click/type/scroll actions are performed via real macOS OS-level input events (CGEvent) through the desktop grounding executor. DOM mutations are limited to form-field writes and synthetic event dispatch via the bridge. ## Architecture @@ -27,6 +27,8 @@ msg_bridge.js (ISOLATED world) content.js (MAIN world, window.__AIRI_DG__) ``` +The background service worker also maintains a native WebSocket connection to `BrowserDomExtensionBridge` (default port 8765) to relay commands from the AIRI host process. + ## Installation (development) 1. Open `chrome://extensions/` @@ -35,23 +37,6 @@ content.js (MAIN world, window.__AIRI_DG__) 4. Select this `chrome-extension/` directory 5. The extension will auto-inject into all pages -## Bridge endpoint override - -By default the background worker connects to `ws://127.0.0.1:8765`. - -If `computer-use-mcp` is running with a non-default -`COMPUTER_USE_BROWSER_DOM_BRIDGE_HOST` or `COMPUTER_USE_BROWSER_DOM_BRIDGE_PORT`, -override the extension endpoint through `chrome.storage.local`: - -```js -await chrome.storage.local.set({ - browserDomBridgeHost: '127.0.0.1', - browserDomBridgePort: 8876, -}) -``` - -The service worker watches these keys and reconnects automatically. - ## Supported commands | Command | Description | @@ -63,7 +48,15 @@ The service worker watches these keys and reconnects automatically. | `findElements` | Find multiple elements by CSS selector | | `getClickTarget` | Get element center point for click targeting | | `getElementAttributes` | Get all attributes of an element | +| `setInputValue` | Set value of a text input or textarea | +| `checkCheckbox` | Check or uncheck a native checkbox/radio | +| `selectOption` | Select an option in a ` element + * - readInputValue: read the current value of an input/textarea/select + * - getComputedStyles: get computed CSS styles for an element + * - triggerEvent: dispatch a DOM event on an element + * - waitForElement: wait for an element to appear in the DOM + * - clickAt: dispatch a click event at viewport coordinates */ async function handleCommand(cmd) { const { action, id } = cmd @@ -289,7 +144,67 @@ async function handleCommand(cmd) { result = await runCUAction(tabId, cmd.frameIds || null, 'getElementAttributes', [cmd.selector || '']) break + case 'setInputValue': + result = await runCUAction(tabId, cmd.frameIds || null, 'setInputValue', [ + cmd.selector || '', + cmd.value || '', + { blur: cmd.opts?.blur !== false, simulateKeystrokes: !!cmd.opts?.simulateKeystrokes }, + ]) + break + + case 'checkCheckbox': + result = await runCUAction(tabId, cmd.frameIds || null, 'checkCheckbox', [ + cmd.selector || '', + cmd.checked, + ]) + break + + case 'selectOption': + result = await runCUAction(tabId, cmd.frameIds || null, 'selectOption', [ + cmd.selector || '', + cmd.value || '', + ]) + break + + case 'readInputValue': + result = await runCUAction(tabId, cmd.frameIds || null, 'readInputValue', [ + cmd.selector || '', + ]) + break + + case 'getComputedStyles': + result = await runCUAction(tabId, cmd.frameIds || null, 'getComputedStyles', [ + cmd.selector || '', + cmd.properties || [], + ]) + break + + case 'triggerEvent': + result = await runCUAction(tabId, cmd.frameIds || null, 'triggerEvent', [ + cmd.selector || '', + cmd.eventName || '', + cmd.opts || {}, + ]) + break + + case 'waitForElement': + result = await runCUAction(tabId, cmd.frameIds || null, 'waitForElement', [ + cmd.selector || '', + cmd.timeoutMs || 5000, + ]) + break + + case 'clickAt': + result = await runCUAction(tabId, cmd.frameIds || null, 'clickAt', [ + cmd.x ?? 0, + cmd.y ?? 0, + ]) + break + default: + // NOTICE: unknown actions must return ok:false so BrowserDomExtensionBridge + // rejects the pending promise; returning ok:true would make callers like + // setInputValue/checkCheckbox see a resolved promise and skip fallback paths. return { id, ok: false, error: `unknown action: ${action}` } } @@ -305,24 +220,6 @@ async function handleCommand(cmd) { // or through the existing WebSocket bridge mechanism chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => { - void connectBridge() - - if (msg.type === 'AIRI_DG_SET_BRIDGE_ENDPOINT') { - saveBridgeConfig(msg.host, msg.port) - .then(() => { - reconnectBridgeNow() - sendResponse({ - ok: true, - host: bridgeHost, - port: bridgePort, - }) - }) - .catch((e) => { - sendResponse({ ok: false, error: e?.message || String(e) }) - }) - return true - } - if (msg.type === 'AIRI_DG_COMMAND') { handleCommand(msg.data) .then(resp => sendResponse(resp)) @@ -330,39 +227,64 @@ chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => { return true // Keep sendResponse async } - // Support the existing ws-incoming format from BrowserDomExtensionBridge - if (msg.type === 'ws-incoming') { - handleCommand(msg.data) - .then((resp) => { - // Send response back via the same channel - chrome.runtime.sendMessage({ type: 'ws-send', data: resp }) - }) - .catch((e) => { - chrome.runtime.sendMessage({ type: 'ws-send', data: { id: msg.data?.id, ok: false, error: String(e) } }) - }) - return false - } - return false }) -chrome.storage.onChanged.addListener((changes, areaName) => { - if (areaName !== 'local') +// ---- WebSocket Relay ---- +// Injects the WebSocket connection directly in the background worker, +// replacing the deleted offscreen document. +// TODO: Add shared-secret auth handshake to prevent rogue localhost processes +// from hijacking the bridge. The bridge server should generate a token and +// inject it into chrome.storage.local so the extension can present it on hello. +const WS_URL = 'ws://localhost:8765' +const BRIDGE_VERSION = 'cu-bridge-2026-02-06-no-eval' +let ws = null +let reconnectDelay = 1000 +const MAX_DELAY = 30000 + +function connectWS() { + if (ws && (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)) return - if (changes[BRIDGE_HOST_STORAGE_KEY] || changes[BRIDGE_PORT_STORAGE_KEY]) { - void loadBridgeConfig().finally(() => { - reconnectBridgeNow() - }) + ws = new WebSocket(WS_URL) + + ws.onopen = () => { + console.log('[background] WebSocket connected') + reconnectDelay = 1000 + ws.send(JSON.stringify({ type: 'hello', source: 'chrome-extension', version: BRIDGE_VERSION })) } -}) -chrome.runtime.onStartup.addListener(() => { - void connectBridge() -}) + ws.onmessage = (evt) => { + try { + const data = JSON.parse(evt.data) + handleCommand(data) + .then((resp) => { + if (ws && ws.readyState === WebSocket.OPEN) { + ws.send(JSON.stringify(resp)) + } + }) + .catch((e) => { + if (ws && ws.readyState === WebSocket.OPEN) { + ws.send(JSON.stringify({ id: data?.id, ok: false, error: String(e) })) + } + }) + } + catch (e) { + console.error('[background] parse error:', e) + } + } -chrome.runtime.onInstalled.addListener(() => { - void connectBridge() -}) + ws.onclose = () => { + console.log(`[background] WebSocket closed, reconnect in ${reconnectDelay}ms`) + ws = null + setTimeout(connectWS, reconnectDelay) + reconnectDelay = Math.min(reconnectDelay * 2, MAX_DELAY) + } + + ws.onerror = (e) => { + console.error('[background] WebSocket error:', e) + ws?.close() + } +} -void connectBridge() +connectWS() diff --git a/services/computer-use-mcp/chrome-extension/content.js b/services/computer-use-mcp/chrome-extension/content.js index cdd5efc6cf..7ebefbc0b5 100644 --- a/services/computer-use-mcp/chrome-extension/content.js +++ b/services/computer-use-mcp/chrome-extension/content.js @@ -4,15 +4,17 @@ * Injected into every frame (including cross-origin iframes) in the MAIN world. * Namespace: window.__AIRI_DG__ * - * IMPORTANT: This script is READ-ONLY. It does NOT perform any DOM mutations, - * clicks, typing, or navigation. All execution is done via real macOS OS-level - * input events through the desktop grounding executor. + * IMPORTANT: Direct DOM mutations here are limited to bridge-triggered write + * actions (setInputValue, checkCheckbox, selectOption) that are only reachable + * via a WebSocket command from the AIRI computer-use-mcp service. Physical + * pointer/keyboard actions still go through real macOS OS-level input. * - * Adapted from /Users/liuziheng/computer_use/chrome-extension/content.js. + * Adapted from the upstream computer-use chrome-extension. * Stripped: clickAt, typeAt, hoverAt, scrollAt, simulateDragDrop, readStorage, - * setStorage, readCanvasData, injectCSS, and all other DOM-mutating methods. + * setStorage, readCanvasData, injectCSS, and all other untracked DOM mutations. * Kept: collectFrameDOM, _describeElement, _collectInteractiveElements, * findElement, findElements, getClickTarget. + * Added: setInputValue, checkCheckbox, selectOption. */ (function () { 'use strict' @@ -37,6 +39,7 @@ name: el.name || '', type: el.type || '', className: typeof el.className === 'string' ? el.className.slice(0, 120) : '', + // eslint-disable-next-line unicorn/prefer-dom-node-text-content -- intentional: innerText returns visible text only text: (el.innerText || el.textContent || '').slice(0, 120).trim(), value: el.value !== undefined ? String(el.value).slice(0, 60) : '', href: el.href || '', @@ -67,35 +70,6 @@ return els } - /** - * Get this frame's embedding rect relative to its parent viewport. - * - * NOTICE: Cross-origin frames may not expose `window.frameElement`. - * In that case we return null and let the adapter skip those frame-local - * coordinates rather than projecting them incorrectly onto the desktop. - */ - function _getFrameRect() { - try { - if (window.top === window) - return null - - const frameEl = window.frameElement - if (!(frameEl instanceof Element)) - return null - - const r = frameEl.getBoundingClientRect() - return { - x: Math.round(r.left), - y: Math.round(r.top), - w: Math.round(r.width), - h: Math.round(r.height), - } - } - catch { - return null - } - } - // ---- Core API (read-only) ---- const __AIRI_DG__ = { @@ -112,8 +86,8 @@ return { url: location.href, title: document.title || '', + // eslint-disable-next-line unicorn/prefer-dom-node-text-content -- intentional: innerText returns visible text only bodyText: includeText ? (document.body ? document.body.innerText || '' : '').slice(0, 3000) : '', - frameRect: _getFrameRect() || undefined, interactiveElements: _collectInteractiveElements(maxElements), } }, @@ -156,6 +130,10 @@ /** * Get the center point of an element for click targeting. * Returns the element description with center coordinates. + * + * Coordinates are exposed both at the top level (x, y) and under + * `center` for backward compatibility. The extension bridge reads + * top-level x/y via unwrapResultPayload. */ getClickTarget(selector) { try { @@ -163,13 +141,16 @@ if (!el) return { success: false, error: 'not found' } const r = el.getBoundingClientRect() + const x = Math.round(r.left + r.width / 2) + const y = Math.round(r.top + r.height / 2) return { success: true, element: _describeElement(el), - center: { - x: Math.round(r.left + r.width / 2), - y: Math.round(r.top + r.height / 2), - }, + // Top-level x/y are read by extension-bridge.ts → clickSelector + x, + y, + // Keep center for any callers that read it directly + center: { x, y }, } } catch (e) { @@ -195,12 +176,227 @@ return { success: false, error: e.message } } }, + + /** + * Set the value of a text input or textarea via the DOM. + * Dispatches input + change events so frameworks (React, Vue, etc.) detect + * the change. Optionally blurs the element when done. + */ + setInputValue(selector, value, opts) { + try { + opts = opts || {} + // TODO: opts.simulateKeystrokes is accepted but ignored — we always do + // a single direct value assignment. Implement per-character KeyboardEvent + // dispatch for autocomplete/masker/validation flows that depend on keydown/keyup. + const el = document.querySelector(selector) + if (!el) + return { success: false, error: 'not found' } + // NOTICE: must pick the setter matching the element's prototype — + // calling HTMLInputElement.prototype.value.set on a